From 2adf9484a366162b817ffa6588c00fbd9c2bba7f Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Tue, 12 Jul 2022 12:31:24 +0200 Subject: [PATCH 01/10] Redo fix for NvidiaCollector and MiG. Got lost somehow --- collectors/nvidiaMetric.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/collectors/nvidiaMetric.go b/collectors/nvidiaMetric.go index 458ecd4..0eb7c8a 100644 --- a/collectors/nvidiaMetric.go +++ b/collectors/nvidiaMetric.go @@ -275,7 +275,7 @@ func readUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) erro err := errors.New(nvml.ErrorString(ret)) return err } - if !isMig { + if isMig { return nil } @@ -620,7 +620,7 @@ func readEncUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) e err := errors.New(nvml.ErrorString(ret)) return err } - if !isMig { + if isMig { return nil } if !device.excludeMetrics["nv_encoder_util"] { @@ -647,7 +647,7 @@ func readDecUtilization(device NvidiaCollectorDevice, output chan lp.CCMetric) e err := errors.New(nvml.ErrorString(ret)) return err } - if !isMig { + if isMig { return nil } if !device.excludeMetrics["nv_decoder_util"] { From b3c27e0af5ec5980117e429a16d75872f83c927e Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Wed, 13 Jul 2022 10:09:49 +0200 Subject: [PATCH 02/10] Merge latest development changes (#80) * Cleanup: Remove unused code * Use Golang duration parser for 'interval' and 'duration' in main config * Update handling of LIKWID headers. Download only if not already present in the system. Fixes #73 * Units with cc-units (#64) * Add option to normalize units with cc-unit * Add unit conversion to router * Add option to change unit prefix in the router * Add to MetricRouter README * Add order of operations in router to README * Use second add_tags/del_tags only if metric gets renamed * Skip disks in DiskstatCollector that have size=0 * Check readability of sensor files in TempCollector * Fix for --once option * Rename `cpu` type to `hwthread` (#69) * Rename 'cpu' type to 'hwthread' to avoid naming clashes with MetricStore and CC-Webfrontend * Collectors in parallel (#74) * Provide info to CollectorManager whether the collector can be executed in parallel with others * Split serial and parallel collectors. Read in parallel first * Update NvidiaCollector with new metrics, MIG and NvLink support (#75) * CC topology module update (#76) * Rename CPU to hardware thread, write some comments * Do renaming in other parts * Remove CpuList and SocketList function from metricCollector. Available in ccTopology * Option to use MIG UUID as subtype-id in NvidiaCollector * Option to use MIG slice name as subtype-id in NvidiaCollector * MetricRouter: Fix JSON in README * Fix for Github Action to really use the selected version * Remove Ganglia installation in runonce Action and add Go 1.18 * Fix daemon options in init script * Add separate go.mod files to use it with deprecated 1.16 * Minor updates for Makefiles * fix string comparison * AMD ROCm SMI collector (#77) * Add collector for AMD ROCm SMI metrics * Fix import path * Fix imports * Remove Board Number * store GPU index explicitly * Remove board number from description * Use http instead of ftp to download likwid * Fix serial number in rocmCollector * Improved http sink (#78) * automatic flush in NatsSink * tweak default options of HttpSink * shorter cirt. section and retries for HttpSink * fix error handling * Remove file added by mistake. * Use http instead of ftp to download likwid * Fix serial number in rocmCollector Co-authored-by: Thomas Roehl * Fix: When sending metrics failed the batch size could be exceeded * Improved dropping of metrics failed to send * Add memstats and topprocs metric * Updated to latest modules * Check that at least one sink is running * Add drop rate, when send buffer is full * Allow only one timer at a time * Use mutex to ensure only on flush timer is running * Fix for NvidiaCollector when devices are not in MiG mode * Remove Golang version 1.16 an 1.17 from Action. Latest commits require Golang 1.18 * Use Golang 1.18 in Release action to build RPMs * Change unit of CpufreqCollector to Hz. That's what the sysfs outputs * Make wget quiet in Release action to reduce log size Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Co-authored-by: Lou --- .github/workflows/Release.yml | 21 +++++-- .github/workflows/runonce.yml | 54 +---------------- collectors.json | 4 ++ collectors/cpufreqMetric.go | 2 +- go.mod | 32 ++++++++-- go.mod.1.17+ | 13 ++-- sinks/influxSink.go | 110 ++++++++++++++++++++++++---------- sinks/sinkManager.go | 8 ++- 8 files changed, 138 insertions(+), 106 deletions(-) diff --git a/.github/workflows/Release.yml b/.github/workflows/Release.yml index f75a167..f7143cb 100644 --- a/.github/workflows/Release.yml +++ b/.github/workflows/Release.yml @@ -27,7 +27,9 @@ jobs: # Use dnf to install development packages - name: Install development packages - run: dnf --assumeyes group install "Development Tools" "RPM Development Tools" + run: | + dnf --assumeyes group install "Development Tools" "RPM Development Tools" + dnf --assumeyes install wget openssl-devel diffutils delve which # Checkout git repository and submodules # fetch-depth must be 0 to use git describe @@ -41,9 +43,11 @@ jobs: # Use dnf to install build dependencies - name: Install build dependencies run: | - dnf --assumeyes install 'dnf-command(builddep)' - dnf --assumeyes install which - dnf --assumeyes builddep scripts/cc-metric-collector.spec + wget -q http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \ + http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-bin-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \ + http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-src-1.18.2-1.module_el8.7.0+1173+5d37c0fd.noarch.rpm \ + http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/go-toolset-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm + rpm -i go*.rpm - name: RPM build MetricCollector id: rpmbuild @@ -93,7 +97,7 @@ jobs: # Use dnf to install development packages - name: Install development packages - run: dnf --assumeyes --disableplugin=subscription-manager install rpm-build go-srpm-macros rpm-build-libs rpm-libs gcc make python38 git + run: dnf --assumeyes --disableplugin=subscription-manager install rpm-build go-srpm-macros rpm-build-libs rpm-libs gcc make python38 git wget openssl-devel diffutils delve which # Checkout git repository and submodules # fetch-depth must be 0 to use git describe @@ -106,7 +110,12 @@ jobs: # Use dnf to install build dependencies - name: Install build dependencies - run: dnf --assumeyes --disableplugin=subscription-manager builddep scripts/cc-metric-collector.spec + run: | + wget -q http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \ + http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-bin-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm \ + http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/golang-src-1.18.2-1.module_el8.7.0+1173+5d37c0fd.noarch.rpm \ + http://mirror.centos.org/centos/8-stream/AppStream/x86_64/os/Packages/go-toolset-1.18.2-1.module_el8.7.0+1173+5d37c0fd.x86_64.rpm + rpm -i go*.rpm - name: RPM build MetricCollector id: rpmbuild diff --git a/.github/workflows/runonce.yml b/.github/workflows/runonce.yml index a12e002..2036179 100644 --- a/.github/workflows/runonce.yml +++ b/.github/workflows/runonce.yml @@ -31,56 +31,4 @@ jobs: run: make - name: Run MetricCollector once - run: ./cc-metric-collector --once --config .github/ci-config.json - - # - # Job build-1-17 - # Build on latest Ubuntu using golang version 1.17 - # - build-1-17: - runs-on: ubuntu-latest - steps: - # See: https://github.com/marketplace/actions/checkout - # Checkout git repository and submodules - - name: Checkout - uses: actions/checkout@v2 - with: - submodules: recursive - - # See: https://github.com/marketplace/actions/setup-go-environment - - name: Setup Golang - uses: actions/setup-go@v3 - with: - go-version: '1.17.7' - - - name: Build MetricCollector - run: make - - - name: Run MetricCollector once - run: ./cc-metric-collector --once --config .github/ci-config.json - - # - # Job build-1-16 - # Build on latest Ubuntu using golang version 1.16 - # - build-1-16: - runs-on: ubuntu-latest - steps: - # See: https://github.com/marketplace/actions/checkout - # Checkout git repository and submodules - - name: Checkout - uses: actions/checkout@v2 - with: - submodules: recursive - - # See: https://github.com/marketplace/actions/setup-go-environment - - name: Setup Golang - uses: actions/setup-go@v3 - with: - go-version: '1.16.7' # The version AlmaLinux 8.5 uses - - - name: Build MetricCollector - run: make - - - name: Run MetricCollector once - run: ./cc-metric-collector --once --config .github/ci-config.json + run: ./cc-metric-collector --once --config .github/ci-config.json \ No newline at end of file diff --git a/collectors.json b/collectors.json index be3eea7..42cb833 100644 --- a/collectors.json +++ b/collectors.json @@ -12,6 +12,7 @@ "proc_total" ] }, + "memstat": {}, "netstat": { "include_devices": [ "enp5s0" @@ -33,5 +34,8 @@ "type-id": "1" } } + }, + "topprocs": { + "num_procs": 5 } } \ No newline at end of file diff --git a/collectors/cpufreqMetric.go b/collectors/cpufreqMetric.go index 3099900..e6a0081 100644 --- a/collectors/cpufreqMetric.go +++ b/collectors/cpufreqMetric.go @@ -63,7 +63,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error { m.meta = map[string]string{ "source": m.name, "group": "CPU", - "unit": "MHz", + "unit": "Hz", } // Loop for all CPU directories diff --git a/go.mod b/go.mod index a4aacb8..37e242f 100644 --- a/go.mod +++ b/go.mod @@ -3,16 +3,36 @@ module github.com/ClusterCockpit/cc-metric-collector go 1.17 require ( - github.com/ClusterCockpit/cc-units v0.0.0-20220318130935-92a0c6442220 + github.com/ClusterCockpit/cc-units v0.3.0 + github.com/ClusterCockpit/go-rocm-smi v0.3.0 github.com/NVIDIA/go-nvml v0.11.6-0 - github.com/PaesslerAG/gval v1.1.2 + github.com/PaesslerAG/gval v1.2.0 github.com/gorilla/mux v1.8.0 - github.com/influxdata/influxdb-client-go/v2 v2.8.1 + github.com/influxdata/influxdb-client-go/v2 v2.9.0 github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf - github.com/nats-io/nats.go v1.14.0 - github.com/prometheus/client_golang v1.12.1 + github.com/nats-io/nats.go v1.16.0 + github.com/prometheus/client_golang v1.12.2 github.com/stmcginnis/gofish v0.13.0 - golang.org/x/sys v0.0.0-20220412211240-33da011f77ad + golang.org/x/sys v0.0.0-20220622161953-175b2fd9d664 +) + +require ( + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.1.2 // indirect + github.com/deepmap/oapi-codegen v1.8.2 // indirect + github.com/golang/protobuf v1.5.2 // indirect + github.com/matttproud/golang_protobuf_extensions v1.0.1 // indirect + github.com/nats-io/nkeys v0.3.0 // indirect + github.com/nats-io/nuid v1.0.1 // indirect + github.com/pkg/errors v0.9.1 // indirect + github.com/prometheus/client_model v0.2.0 // indirect + github.com/prometheus/common v0.32.1 // indirect + github.com/prometheus/procfs v0.7.3 // indirect + github.com/shopspring/decimal v1.3.1 // indirect + golang.org/x/crypto v0.0.0-20210314154223-e6e6c4f2bb5b // indirect + golang.org/x/net v0.0.0-20210525063256-abc453219eb5 // indirect + google.golang.org/protobuf v1.26.0 // indirect + gopkg.in/yaml.v2 v2.4.0 // indirect ) require ( diff --git a/go.mod.1.17+ b/go.mod.1.17+ index b3dc6ca..5c15a61 100644 --- a/go.mod.1.17+ +++ b/go.mod.1.17+ @@ -3,14 +3,15 @@ module github.com/ClusterCockpit/cc-metric-collector go 1.17 require ( + github.com/ClusterCockpit/cc-units v0.3.0 + github.com/ClusterCockpit/go-rocm-smi v0.3.0 github.com/NVIDIA/go-nvml v0.11.6-0 - github.com/PaesslerAG/gval v1.1.2 + github.com/PaesslerAG/gval v1.2.0 github.com/gorilla/mux v1.8.0 - github.com/influxdata/influxdb-client-go/v2 v2.8.1 + github.com/influxdata/influxdb-client-go/v2 v2.9.0 github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf - github.com/nats-io/nats-server/v2 v2.8.0 // indirect - github.com/nats-io/nats.go v1.14.0 - github.com/prometheus/client_golang v1.12.1 + github.com/nats-io/nats.go v1.16.0 + github.com/prometheus/client_golang v1.12.2 github.com/stmcginnis/gofish v0.13.0 - golang.org/x/sys v0.0.0-20220412211240-33da011f77ad + golang.org/x/sys v0.0.0-20220622161953-175b2fd9d664 ) diff --git a/sinks/influxSink.go b/sinks/influxSink.go index 212647d..90f8da8 100644 --- a/sinks/influxSink.go +++ b/sinks/influxSink.go @@ -29,15 +29,23 @@ type InfluxSink struct { Password string `json:"password,omitempty"` Organization string `json:"organization,omitempty"` SSL bool `json:"ssl,omitempty"` - // Maximum number of points sent to server in single request. Default 100 + // Maximum number of points sent to server in single request. + // Default: 1000 BatchSize int `json:"batch_size,omitempty"` - // Interval, in which is buffer flushed if it has not been already written (by reaching batch size). Default 1s + // Time interval for delayed sending of metrics. + // If the buffers are already filled before the end of this interval, + // the metrics are sent without further delay. + // Default: 1s FlushInterval string `json:"flush_delay,omitempty"` + // Number of metrics that are dropped when buffer is full + // Default: 100 + DropRate int `json:"drop_rate,omitempty"` } - batch []*write.Point - flushTimer *time.Timer - flushDelay time.Duration - lock sync.Mutex // Flush() runs in another goroutine, so this lock has to protect the buffer + batch []*write.Point + flushTimer *time.Timer + flushDelay time.Duration + batchMutex sync.Mutex // Flush() runs in another goroutine, so this lock has to protect the buffer + flushTimerMutex sync.Mutex // Ensure only one flush timer is running } // connect connects to the InfluxDB server @@ -62,7 +70,10 @@ func (s *InfluxSink) connect() error { } else { auth = fmt.Sprintf("%s:%s", s.config.User, s.config.Password) } - cclog.ComponentDebug(s.name, "Using URI", uri, "Org", s.config.Organization, "Bucket", s.config.Database) + cclog.ComponentDebug(s.name, + "Using URI='"+uri+"'", + "Org='"+s.config.Organization+"'", + "Bucket='"+s.config.Database+"'") // Set influxDB client options clientOptions := influxdb2.DefaultOptions() @@ -93,47 +104,64 @@ func (s *InfluxSink) connect() error { func (s *InfluxSink) Write(m lp.CCMetric) error { - if len(s.batch) == 0 && s.flushDelay != 0 { - // This is the first write since the last flush, start the flushTimer! - if s.flushTimer != nil && s.flushTimer.Stop() { - cclog.ComponentDebug(s.name, "unexpected: the flushTimer was already running?") - } - - // Run a batched flush for all lines that have arrived in the last flush delay interval + if s.flushDelay != 0 && s.flushTimerMutex.TryLock() { + // Run a batched flush for all metrics that arrived in the last flush delay interval + cclog.ComponentDebug(s.name, "Starting new flush timer") s.flushTimer = time.AfterFunc( s.flushDelay, func() { + defer s.flushTimerMutex.Unlock() + cclog.ComponentDebug(s.name, "Starting flush in flush timer") if err := s.Flush(); err != nil { - cclog.ComponentError(s.name, "flush failed:", err.Error()) + cclog.ComponentError(s.name, "Flush timer: flush failed:", err) } }) } + // Lock access to batch slice + s.batchMutex.Lock() + + // batch slice full, dropping oldest metric(s) + // e.g. when previous flushes failed and batch slice was not cleared + if len(s.batch) == s.config.BatchSize { + newSize := s.config.BatchSize - s.config.DropRate + + for i := 0; i < newSize; i++ { + s.batch[i] = s.batch[i+s.config.DropRate] + } + for i := newSize; i < s.config.BatchSize; i++ { + s.batch[i] = nil + } + s.batch = s.batch[:newSize] + cclog.ComponentError(s.name, "Batch slice full, dropping", s.config.DropRate, "oldest metric(s)") + } + // Append metric to batch slice p := m.ToPoint(s.meta_as_tags) - s.lock.Lock() s.batch = append(s.batch, p) - s.lock.Unlock() // Flush synchronously if "flush_delay" is zero - if s.flushDelay == 0 { - return s.Flush() - } - + // or // Flush if batch size is reached - if len(s.batch) == s.config.BatchSize { + if s.flushDelay == 0 || + len(s.batch) == s.config.BatchSize { + // Unlock access to batch slice + s.batchMutex.Unlock() return s.Flush() } + // Unlock access to batch slice + s.batchMutex.Unlock() return nil } // Flush sends all metrics buffered in batch slice to InfluxDB server func (s *InfluxSink) Flush() error { + cclog.ComponentDebug(s.name, "Flushing") // Lock access to batch slice - s.lock.Lock() - defer s.lock.Unlock() + s.batchMutex.Lock() + defer s.batchMutex.Unlock() // Nothing to do, batch slice is empty if len(s.batch) == 0 { @@ -143,7 +171,7 @@ func (s *InfluxSink) Flush() error { // Send metrics from batch slice err := s.writeApi.WritePoint(context.Background(), s.batch...) if err != nil { - cclog.ComponentError(s.name, "flush failed:", err.Error()) + cclog.ComponentError(s.name, "Flush(): Flush of", len(s.batch), "metrics failed:", err) return err } @@ -160,6 +188,9 @@ func (s *InfluxSink) Close() { cclog.ComponentDebug(s.name, "Closing InfluxDB connection") s.flushTimer.Stop() s.Flush() + if err := s.Flush(); err != nil { + cclog.ComponentError(s.name, "Close(): Flush failed:", err) + } s.client.Close() } @@ -169,31 +200,32 @@ func NewInfluxSink(name string, config json.RawMessage) (Sink, error) { s.name = fmt.Sprintf("InfluxSink(%s)", name) // Set config default values - s.config.BatchSize = 100 + s.config.BatchSize = 1000 s.config.FlushInterval = "1s" + s.config.DropRate = 100 // Read config if len(config) > 0 { err := json.Unmarshal(config, &s.config) if err != nil { - return nil, err + return s, err } } if len(s.config.Host) == 0 { - return nil, errors.New("Missing host configuration required by InfluxSink") + return s, errors.New("Missing host configuration required by InfluxSink") } if len(s.config.Port) == 0 { - return nil, errors.New("Missing port configuration required by InfluxSink") + return s, errors.New("Missing port configuration required by InfluxSink") } if len(s.config.Database) == 0 { - return nil, errors.New("Missing database configuration required by InfluxSink") + return s, errors.New("Missing database configuration required by InfluxSink") } if len(s.config.Organization) == 0 { - return nil, errors.New("Missing organization configuration required by InfluxSink") + return s, errors.New("Missing organization configuration required by InfluxSink") } if len(s.config.Password) == 0 { - return nil, errors.New("Missing password configuration required by InfluxSink") + return s, errors.New("Missing password configuration required by InfluxSink") } // Create lookup map to use meta infos as tags in the output metric @@ -210,12 +242,24 @@ func NewInfluxSink(name string, config json.RawMessage) (Sink, error) { } } + if !(s.config.BatchSize > 0) { + return s, fmt.Errorf("batch_size=%d in InfluxDB config must be > 0", s.config.BatchSize) + } + if !(s.config.DropRate > 0) { + return s, fmt.Errorf("drop_rate=%d in InfluxDB config must be > 0", s.config.DropRate) + } + if !(s.config.BatchSize > s.config.DropRate) { + return s, fmt.Errorf( + "batch_size=%d must be greater then drop_rate=%d in InfluxDB config", + s.config.BatchSize, s.config.DropRate) + } + // allocate batch slice s.batch = make([]*write.Point, 0, s.config.BatchSize) // Connect to InfluxDB server if err := s.connect(); err != nil { - return nil, fmt.Errorf("unable to connect: %v", err) + return s, fmt.Errorf("unable to connect: %v", err) } return s, nil } diff --git a/sinks/sinkManager.go b/sinks/sinkManager.go index f531f5d..6af8614 100644 --- a/sinks/sinkManager.go +++ b/sinks/sinkManager.go @@ -76,11 +76,17 @@ func (sm *sinkManager) Init(wg *sync.WaitGroup, sinkConfigFile string) error { for name, raw := range rawConfigs { err = sm.AddOutput(name, raw) if err != nil { - cclog.ComponentError("SinkManager", err.Error()) + cclog.ComponentError("SinkManager", err) continue } } + // Check that at least one sink is running + if !(len(sm.sinks) > 0) { + cclog.ComponentError("SinkManager", "Found no usable sinks") + return fmt.Errorf("Found no usable sinks") + } + return nil } From 88fabc2e83bdc55fad8430a99d02c1a35ba4cfe3 Mon Sep 17 00:00:00 2001 From: oscarminus Date: Wed, 7 Sep 2022 14:09:29 +0200 Subject: [PATCH 03/10] cpustatMetric.go: Use derived values instead of absolute values (#83) * cpustatMetric.go: Use derived values instead of absolute values The values in /proc/stat are absolute counters related to the boot time of the system. To obtain a utilization of the CPU, the changes in the counters must be derived according to time. To take only the absolute values leads to the fact that changes in the utilization, straight with larger values, do not become visible. * Add new collector for /proc/schedstat The `schedstat` collector reads data from /proc/schedstat and calculates a load value, separated by hwthread. This might be useful to detect bad cpu pinning on shared nodes etc. Co-authored-by: Michael Schwarz --- collectors/collectorManager.go | 1 + collectors/cpustatMetric.go | 40 ++++++--- collectors/schedstatMetric.go | 155 +++++++++++++++++++++++++++++++++ collectors/schedstatMetric.md | 11 +++ 4 files changed, 197 insertions(+), 10 deletions(-) create mode 100644 collectors/schedstatMetric.go create mode 100644 collectors/schedstatMetric.md diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go index 49a9db8..63d0cb4 100644 --- a/collectors/collectorManager.go +++ b/collectors/collectorManager.go @@ -37,6 +37,7 @@ var AvailableCollectors = map[string]MetricCollector{ "beegfs_meta": new(BeegfsMetaCollector), "beegfs_storage": new(BeegfsStorageCollector), "rocm_smi": new(RocmSmiCollector), + "schedstat": new(SchedstatCollector), } // Metric collector manager data structure diff --git a/collectors/cpustatMetric.go b/collectors/cpustatMetric.go index c0dcf13..3c09b83 100644 --- a/collectors/cpustatMetric.go +++ b/collectors/cpustatMetric.go @@ -11,6 +11,7 @@ import ( cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + sysconf "github.com/tklauser/go-sysconf" ) const CPUSTATFILE = `/proc/stat` @@ -22,9 +23,11 @@ type CpustatCollectorConfig struct { type CpustatCollector struct { metricCollector config CpustatCollectorConfig + lastTimestamp time.Time // Store time stamp of last tick to derive values matches map[string]int cputags map[string]map[string]string nodetags map[string]string + olddata map[string]map[string]int64 } func (m *CpustatCollector) Init(config json.RawMessage) error { @@ -76,36 +79,48 @@ func (m *CpustatCollector) Init(config json.RawMessage) error { // Pre-generate tags for all CPUs num_cpus := 0 m.cputags = make(map[string]map[string]string) + m.olddata = make(map[string]map[string]int64) scanner := bufio.NewScanner(file) for scanner.Scan() { line := scanner.Text() linefields := strings.Fields(line) - if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 { + if strings.Compare(linefields[0], "cpu") == 0 { + m.olddata["cpu"] = make(map[string]int64) + for k, v := range m.matches { + m.olddata["cpu"][k], _ = strconv.ParseInt(linefields[v], 0, 64) + } + } else if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 { cpustr := strings.TrimLeft(linefields[0], "cpu") cpu, _ := strconv.Atoi(cpustr) m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)} + m.olddata[linefields[0]] = make(map[string]int64) + for k, v := range m.matches { + m.olddata[linefields[0]][k], _ = strconv.ParseInt(linefields[v], 0, 64) + } num_cpus++ } } + m.lastTimestamp = time.Now() m.init = true return nil } -func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]string, output chan lp.CCMetric) { +func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]string, output chan lp.CCMetric, now time.Time, tsdelta time.Duration) { values := make(map[string]float64) - total := 0.0 + clktck, _ := sysconf.Sysconf(sysconf.SC_CLK_TCK) for match, index := range m.matches { if len(match) > 0 { x, err := strconv.ParseInt(linefields[index], 0, 64) if err == nil { - values[match] = float64(x) - total += values[match] + vdiff := x - m.olddata[linefields[0]][match] + m.olddata[linefields[0]][match] = x // Store new value for next run + values[match] = float64(vdiff) / float64(tsdelta.Seconds()) / float64(clktck) } } } - t := time.Now() + for name, value := range values { - y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": (value * 100.0) / total}, t) + y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": value * 100}, now) if err == nil { output <- y } @@ -117,6 +132,9 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMetric) return } num_cpus := 0 + now := time.Now() + tsdelta := now.Sub(m.lastTimestamp) + file, err := os.Open(string(CPUSTATFILE)) if err != nil { cclog.ComponentError(m.name, err.Error()) @@ -128,9 +146,9 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMetric) line := scanner.Text() linefields := strings.Fields(line) if strings.Compare(linefields[0], "cpu") == 0 { - m.parseStatLine(linefields, m.nodetags, output) + m.parseStatLine(linefields, m.nodetags, output, now, tsdelta) } else if strings.HasPrefix(linefields[0], "cpu") { - m.parseStatLine(linefields, m.cputags[linefields[0]], output) + m.parseStatLine(linefields, m.cputags[linefields[0]], output, now, tsdelta) num_cpus++ } } @@ -139,11 +157,13 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMetric) m.nodetags, m.meta, map[string]interface{}{"value": int(num_cpus)}, - time.Now(), + now, ) if err == nil { output <- num_cpus_metric } + + m.lastTimestamp = now } func (m *CpustatCollector) Close() { diff --git a/collectors/schedstatMetric.go b/collectors/schedstatMetric.go new file mode 100644 index 0000000..e3041ae --- /dev/null +++ b/collectors/schedstatMetric.go @@ -0,0 +1,155 @@ +package collectors + +import ( + "encoding/json" + "fmt" + "bufio" + "time" + "os" + "strings" + "strconv" + "math" + + cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" +) + +const SCHEDSTATFILE = `/proc/schedstat` + +// These are the fields we read from the JSON configuration +type SchedstatCollectorConfig struct { + ExcludeMetrics []string `json:"exclude_metrics,omitempty"` +} + +// This contains all variables we need during execution and the variables +// defined by metricCollector (name, init, ...) +type SchedstatCollector struct { + metricCollector + config SchedstatCollectorConfig // the configuration structure + lastTimestamp time.Time // Store time stamp of last tick to derive values + meta map[string]string // default meta information + cputags map[string]map[string]string // default tags + olddata map[string]map[string]int64 // default tags +} + +// Functions to implement MetricCollector interface +// Init(...), Read(...), Close() +// See: metricCollector.go + +// Init initializes the sample collector +// Called once by the collector manager +// All tags, meta data tags and metrics that do not change over the runtime should be set here +func (m *SchedstatCollector) Init(config json.RawMessage) error { + var err error = nil + // Always set the name early in Init() to use it in cclog.Component* functions + m.name = "SchedstatCollector" + // This is for later use, also call it early + m.setup() + // Tell whether the collector should be run in parallel with others (reading files, ...) + // or it should be run serially, mostly for collectors acutally doing measurements + // because they should not measure the execution of the other collectors + m.parallel = true + // Define meta information sent with each metric + // (Can also be dynamic or this is the basic set with extension through AddMeta()) + m.meta = map[string]string{"source": m.name, "group": "SCHEDSTAT"} + + // Read in the JSON configuration + if len(config) > 0 { + err = json.Unmarshal(config, &m.config) + if err != nil { + cclog.ComponentError(m.name, "Error reading config:", err.Error()) + return err + } + } + + // Check input file + file, err := os.Open(string(SCHEDSTATFILE)) + if err != nil { + cclog.ComponentError(m.name, err.Error()) + } + defer file.Close() + + // Pre-generate tags for all CPUs + num_cpus := 0 + m.cputags = make(map[string]map[string]string) + m.olddata = make(map[string]map[string]int64) + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Text() + linefields := strings.Fields(line) + if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 { + cpustr := strings.TrimLeft(linefields[0], "cpu") + cpu, _ := strconv.Atoi(cpustr) + running, _ := strconv.ParseInt(linefields[7], 10, 64) + waiting, _ := strconv.ParseInt(linefields[8], 10, 64) + m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)} + m.olddata[linefields[0]] = map[string]int64{"running" : running, "waiting" : waiting} + num_cpus++ + } + } + + + // Save current timestamp + m.lastTimestamp = time.Now() + + // Set this flag only if everything is initialized properly, all required files exist, ... + m.init = true + return err +} + +func (m *SchedstatCollector) ParseProcLine(linefields []string, tags map[string]string, output chan lp.CCMetric, now time.Time, tsdelta time.Duration) { + running, _ := strconv.ParseInt(linefields[7], 10, 64) + waiting, _ := strconv.ParseInt(linefields[8], 10, 64) + diff_running := running - m.olddata[linefields[0]]["running"] + diff_waiting := waiting - m.olddata[linefields[0]]["waiting"] + + var l_running float64 = float64(diff_running) / tsdelta.Seconds() / (math.Pow(1000, 3)) + var l_waiting float64 = float64(diff_waiting) / tsdelta.Seconds() / (math.Pow(1000, 3)) + + m.olddata[linefields[0]]["running"] = running + m.olddata[linefields[0]]["waiting"] = waiting + value := l_running + l_waiting + + y, err := lp.New("cpu_load_core", tags, m.meta, map[string]interface{}{"value": value}, now) + if err == nil { + // Send it to output channel + output <- y + } +} + +// Read collects all metrics belonging to the sample collector +// and sends them through the output channel to the collector manager +func (m *SchedstatCollector) Read(interval time.Duration, output chan lp.CCMetric) { + if !m.init { + return + } + + //timestamps + now := time.Now() + tsdelta := now.Sub(m.lastTimestamp) + + file, err := os.Open(string(SCHEDSTATFILE)) + if err != nil { + cclog.ComponentError(m.name, err.Error()) + } + defer file.Close() + + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Text() + linefields := strings.Fields(line) + if strings.HasPrefix(linefields[0], "cpu") { + m.ParseProcLine(linefields, m.cputags[linefields[0]], output, now, tsdelta) + } + } + + m.lastTimestamp = now + +} + +// Close metric collector: close network connection, close files, close libraries, ... +// Called once by the collector manager +func (m *SchedstatCollector) Close() { + // Unset flag + m.init = false +} diff --git a/collectors/schedstatMetric.md b/collectors/schedstatMetric.md new file mode 100644 index 0000000..6369eca --- /dev/null +++ b/collectors/schedstatMetric.md @@ -0,0 +1,11 @@ + +## `schedstat` collector +```json + "schedstat": { + } +``` + +The `schedstat` collector reads data from /proc/schedstat and calculates a load value, separated by hwthread. This might be useful to detect bad cpu pinning on shared nodes etc. + +Metric: +* `cpu_load_core` \ No newline at end of file From 5b6a2b9018b821b82afc37cfe87e9e55fd095b60 Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Mon, 12 Sep 2022 12:54:40 +0200 Subject: [PATCH 04/10] Merge latest fixed from `develop` to `main` (#85) * InfiniBandCollector: Scale raw readings from octets to bytes * Fix clock frequency coming from LikwidCollector and update docs --- collectors/infinibandMetric.go | 15 +++++++++------ collectors/likwidMetric.go | 7 ++++--- collectors/likwidMetric.md | 3 +++ 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go index 92ea911..d6613c5 100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -19,8 +19,9 @@ import ( const IB_BASEPATH = "/sys/class/infiniband/" type InfinibandCollectorMetric struct { - path string - unit string + path string + unit string + scale int64 } type InfinibandCollectorInfo struct { @@ -113,10 +114,10 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error { // Check access to counter files countersDir := filepath.Join(path, "counters") portCounterFiles := map[string]InfinibandCollectorMetric{ - "ib_recv": {path: filepath.Join(countersDir, "port_rcv_data"), unit: "bytes"}, - "ib_xmit": {path: filepath.Join(countersDir, "port_xmit_data"), unit: "bytes"}, - "ib_recv_pkts": {path: filepath.Join(countersDir, "port_rcv_packets"), unit: "packets"}, - "ib_xmit_pkts": {path: filepath.Join(countersDir, "port_xmit_packets"), unit: "packets"}, + "ib_recv": {path: filepath.Join(countersDir, "port_rcv_data"), unit: "bytes", scale: 4}, + "ib_xmit": {path: filepath.Join(countersDir, "port_xmit_data"), unit: "bytes", scale: 4}, + "ib_recv_pkts": {path: filepath.Join(countersDir, "port_rcv_packets"), unit: "packets", scale: 1}, + "ib_xmit_pkts": {path: filepath.Join(countersDir, "port_xmit_packets"), unit: "packets", scale: 1}, } for _, counter := range portCounterFiles { err := unix.Access(counter.path, unix.R_OK) @@ -191,6 +192,8 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr fmt.Sprintf("Read(): Failed to convert Infininiband metrice %s='%s' to int64: %v", counterName, data, err)) continue } + // Scale raw value + v *= counterDef.scale // Send absolut values if m.config.SendAbsoluteValues { diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index c036415..f22d486 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -159,7 +159,8 @@ func getBaseFreq() float64 { data := strings.Replace(string(buffer), "\n", "", -1) x, err := strconv.ParseInt(data, 0, 64) if err == nil { - freq = float64(x) * 1e6 + freq = float64(x) + break } } } @@ -168,11 +169,11 @@ func getBaseFreq() float64 { C.power_init(0) info := C.get_powerInfo() if float64(info.baseFrequency) != 0 { - freq = float64(info.baseFrequency) * 1e6 + freq = float64(info.baseFrequency) } C.power_finalize() } - return freq + return freq * 1e3 } func (m *LikwidCollector) Init(config json.RawMessage) error { diff --git a/collectors/likwidMetric.md b/collectors/likwidMetric.md index 1bb211f..54640dc 100644 --- a/collectors/likwidMetric.md +++ b/collectors/likwidMetric.md @@ -7,6 +7,9 @@ The `likwid` collector is probably the most complicated collector. The LIKWID li "likwid": { "force_overwrite" : false, "invalid_to_zero" : false, + "liblikwid_path" : "/path/to/liblikwid.so", + "accessdaemon_path" : "/folder/that/contains/likwid-accessD", + "access_mode" : "direct or accessdaemon or perf_event", "eventsets": [ { "events" : { From be20f956c25d1b216c1b1cb8ec04dcc70834a60c Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Mon, 10 Oct 2022 12:23:51 +0200 Subject: [PATCH 05/10] Add latest development to main branch (#89) * InfiniBandCollector: Scale raw readings from octets to bytes * Fix clock frequency coming from LikwidCollector and update docs * Build DEB package for Ubuntu 20.04 for releases * Fix memstat collector with numa_stats option * Remove useless prints from MemstatCollector * Replace ioutils with os and io (#87) * Use lower case for error strings in RocmSmiCollector * move maybe-usable-by-other-cc-components to pkg. Fix all files to use the new paths (#88) * Add collector for monitoring the execution of cc-metric-collector itself (#81) * Add collector to monitor execution of cc-metric-collector itself * Register SelfCollector * Fix import paths for moved packages --- .github/workflows/Release.yml | 61 +++++++- cc-metric-collector.go | 6 +- collectors/beegfsmetaMetric.go | 12 +- collectors/beegfsstorageMetric.go | 12 +- collectors/collectorManager.go | 9 +- collectors/cpufreqCpuinfoMetric.go | 4 +- collectors/cpufreqMetric.go | 12 +- collectors/cpustatMetric.go | 18 +-- collectors/customCmdMetric.go | 8 +- collectors/diskstatMetric.go | 4 +- collectors/gpfsMetric.go | 10 +- collectors/infinibandMetric.go | 9 +- collectors/iostatMetric.go | 4 +- collectors/ipmiMetric.go | 2 +- collectors/likwidMetric.go | 9 +- collectors/loadavgMetric.go | 8 +- collectors/lustreMetric.go | 4 +- collectors/memstatMetric.go | 28 ++-- collectors/metricCollector.go | 2 +- collectors/netstatMetric.go | 4 +- collectors/nfsMetric.go | 2 +- collectors/numastatsMetric.go | 4 +- collectors/nvidiaMetric.go | 4 +- collectors/rocmsmiMetric.go | 12 +- collectors/sampleMetric.go | 4 +- collectors/sampleTimerMetric.go | 4 +- collectors/schedstatMetric.go | 35 +++-- collectors/selfMetric.go | 144 ++++++++++++++++++ collectors/selfMetric.md | 34 +++++ collectors/tempMetric.go | 18 +-- collectors/topprocsMetric.go | 2 +- go.mod | 2 + go.sum | 5 + internal/metricAggregator/metricAggregator.go | 6 +- .../metricAggregatorFunctions.go | 4 +- internal/metricRouter/metricCache.go | 6 +- internal/metricRouter/metricRouter.go | 6 +- {internal => pkg}/ccLogger/cclogger.go | 0 {internal => pkg}/ccMetric/README.md | 0 {internal => pkg}/ccMetric/ccMetric.go | 0 {internal => pkg}/ccTopology/ccTopology.go | 2 +- {internal => pkg}/multiChanTicker/README.md | 0 .../multiChanTicker/multiChanTicker.go | 2 +- receivers/httpReceiver.go | 4 +- receivers/metricReceiver.go | 2 +- receivers/natsReceiver.go | 4 +- receivers/prometheusReceiver.go | 4 +- receivers/receiveManager.go | 4 +- receivers/redfishReceiver.go | 4 +- receivers/sampleReceiver.go | 2 +- sinks/gangliaCommon.go | 2 +- sinks/gangliaSink.go | 4 +- sinks/httpSink.go | 4 +- sinks/influxAsyncSink.go | 4 +- sinks/influxSink.go | 4 +- sinks/libgangliaSink.go | 4 +- sinks/metricSink.go | 2 +- sinks/natsSink.go | 4 +- sinks/prometheusSink.go | 4 +- sinks/sampleSink.go | 4 +- sinks/sinkManager.go | 4 +- sinks/stdoutSink.go | 2 +- 62 files changed, 417 insertions(+), 171 deletions(-) create mode 100644 collectors/selfMetric.go create mode 100644 collectors/selfMetric.md rename {internal => pkg}/ccLogger/cclogger.go (100%) rename {internal => pkg}/ccMetric/README.md (100%) rename {internal => pkg}/ccMetric/ccMetric.go (100%) rename {internal => pkg}/ccTopology/ccTopology.go (99%) rename {internal => pkg}/multiChanTicker/README.md (100%) rename {internal => pkg}/multiChanTicker/multiChanTicker.go (93%) diff --git a/.github/workflows/Release.yml b/.github/workflows/Release.yml index f7143cb..1d1906b 100644 --- a/.github/workflows/Release.yml +++ b/.github/workflows/Release.yml @@ -133,13 +133,63 @@ jobs: name: cc-metric-collector SRPM for UBI 8 path: ${{ steps.rpmbuild.outputs.SRPM }} + # + # Build on Ubuntu 20.04 using official go package + # + Ubuntu-focal-build: + runs-on: ubuntu-latest + container: ubuntu:20.04 + # The job outputs link to the outputs of the 'debrename' step + # Only job outputs can be used in child jobs + outputs: + deb : ${{steps.debrename.outputs.DEB}} + steps: + # Use apt to install development packages + - name: Install development packages + run: | + apt update && apt --assume-yes upgrade + apt --assume-yes install build-essential sed git wget bash + # Checkout git repository and submodules + # fetch-depth must be 0 to use git describe + # See: https://github.com/marketplace/actions/checkout + - name: Checkout + uses: actions/checkout@v2 + with: + submodules: recursive + fetch-depth: 0 + # Use official golang package + - name: Install Golang + run: | + wget -q https://go.dev/dl/go1.19.1.linux-amd64.tar.gz + tar -C /usr/local -xzf go1.19.1.linux-amd64.tar.gz + export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH + go version + - name: DEB build MetricCollector + id: dpkg-build + run: | + export PATH=/usr/local/go/bin:/usr/local/go/pkg/tool/linux_amd64:$PATH + make DEB + - name: Rename DEB (add '_ubuntu20.04') + id: debrename + run: | + OLD_DEB_NAME=$(echo "${{steps.dpkg-build.outputs.DEB}}" | rev | cut -d '.' -f 2- | rev) + NEW_DEB_FILE="${OLD_DEB_NAME}_ubuntu20.04.deb" + mv "${{steps.dpkg-build.outputs.DEB}}" "${NEW_DEB_FILE}" + echo "::set-output name=DEB::${NEW_DEB_FILE}" + # See: https://github.com/actions/upload-artifact + - name: Save DEB as artifact + uses: actions/upload-artifact@v2 + with: + name: cc-metric-collector DEB for Ubuntu 20.04 + path: ${{ steps.debrename.outputs.DEB }} + # # Create release with fresh RPMs # Release: runs-on: ubuntu-latest # We need the RPMs, so add dependency - needs: [AlmaLinux-RPM-build, UBI-8-RPM-build] + needs: [AlmaLinux-RPM-build, UBI-8-RPM-build, Ubuntu-focal-build] steps: # See: https://github.com/actions/download-artifact @@ -161,6 +211,11 @@ jobs: with: name: cc-metric-collector SRPM for UBI 8 + - name: Download Ubuntu 20.04 DEB + uses: actions/download-artifact@v2 + with: + name: cc-metric-collector DEB for Ubuntu 20.04 + # The download actions do not publish the name of the downloaded file, # so we re-use the job outputs of the parent jobs. The files are all # downloaded to the current folder. @@ -174,14 +229,17 @@ jobs: ALMA_85_SRPM=$(basename "${{ needs.AlmaLinux-RPM-build.outputs.srpm}}") UBI_8_RPM=$(basename "${{ needs.UBI-8-RPM-build.outputs.rpm}}") UBI_8_SRPM=$(basename "${{ needs.UBI-8-RPM-build.outputs.srpm}}") + U_2004_DEB=$(basename "${{ needs.Ubuntu-focal-build.outputs.deb}}") echo "ALMA_85_RPM::${ALMA_85_RPM}" echo "ALMA_85_SRPM::${ALMA_85_SRPM}" echo "UBI_8_RPM::${UBI_8_RPM}" echo "UBI_8_SRPM::${UBI_8_SRPM}" + echo "U_2004_DEB::${U_2004_DEB}" echo "::set-output name=ALMA_85_RPM::${ALMA_85_RPM}" echo "::set-output name=ALMA_85_SRPM::${ALMA_85_SRPM}" echo "::set-output name=UBI_8_RPM::${UBI_8_RPM}" echo "::set-output name=UBI_8_SRPM::${UBI_8_SRPM}" + echo "::set-output name=U_2004_DEB::${U_2004_DEB}" # See: https://github.com/softprops/action-gh-release - name: Release @@ -194,3 +252,4 @@ jobs: ${{ steps.files.outputs.ALMA_85_SRPM }} ${{ steps.files.outputs.UBI_8_RPM }} ${{ steps.files.outputs.UBI_8_SRPM }} + ${{ steps.files.outputs.U_2004_DEB }} diff --git a/cc-metric-collector.go b/cc-metric-collector.go index 42f7843..5544af8 100644 --- a/cc-metric-collector.go +++ b/cc-metric-collector.go @@ -15,10 +15,10 @@ import ( "sync" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" mr "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter" - mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker" ) type CentralConfigFile struct { diff --git a/collectors/beegfsmetaMetric.go b/collectors/beegfsmetaMetric.go index a27faf2..d202773 100644 --- a/collectors/beegfsmetaMetric.go +++ b/collectors/beegfsmetaMetric.go @@ -5,7 +5,7 @@ import ( "bytes" "encoding/json" "fmt" - "io/ioutil" + "io" "os" "os/exec" "os/user" @@ -14,8 +14,8 @@ import ( "strings" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) const DEFAULT_BEEGFS_CMD = "beegfs-ctl" @@ -115,7 +115,7 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMetr return } //get mounpoint - buffer, _ := ioutil.ReadFile(string("/proc/mounts")) + buffer, _ := os.ReadFile(string("/proc/mounts")) mounts := strings.Split(string(buffer), "\n") var mountpoints []string for _, line := range mounts { @@ -157,9 +157,9 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMetr if err != nil { fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error()) fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode()) - data, _ := ioutil.ReadAll(cmdStderr) + data, _ := io.ReadAll(cmdStderr) fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command stderr: \"%s\"\n", string(data)) - data, _ = ioutil.ReadAll(cmdStdout) + data, _ = io.ReadAll(cmdStdout) fmt.Fprintf(os.Stderr, "BeegfsMetaCollector.Read(): command stdout: \"%s\"\n", string(data)) return } diff --git a/collectors/beegfsstorageMetric.go b/collectors/beegfsstorageMetric.go index 1160664..be57e0f 100644 --- a/collectors/beegfsstorageMetric.go +++ b/collectors/beegfsstorageMetric.go @@ -5,7 +5,7 @@ import ( "bytes" "encoding/json" "fmt" - "io/ioutil" + "io" "os" "os/exec" "os/user" @@ -14,8 +14,8 @@ import ( "strings" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) // Struct for the collector-specific JSON config @@ -108,7 +108,7 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM return } //get mounpoint - buffer, _ := ioutil.ReadFile(string("/proc/mounts")) + buffer, _ := os.ReadFile(string("/proc/mounts")) mounts := strings.Split(string(buffer), "\n") var mountpoints []string for _, line := range mounts { @@ -149,9 +149,9 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM if err != nil { fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): Failed to execute command \"%s\": %s\n", cmd.String(), err.Error()) fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command exit code: \"%d\"\n", cmd.ProcessState.ExitCode()) - data, _ := ioutil.ReadAll(cmdStderr) + data, _ := io.ReadAll(cmdStderr) fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command stderr: \"%s\"\n", string(data)) - data, _ = ioutil.ReadAll(cmdStdout) + data, _ = io.ReadAll(cmdStdout) fmt.Fprintf(os.Stderr, "BeegfsStorageCollector.Read(): command stdout: \"%s\"\n", string(data)) return } diff --git a/collectors/collectorManager.go b/collectors/collectorManager.go index 63d0cb4..ea648ef 100644 --- a/collectors/collectorManager.go +++ b/collectors/collectorManager.go @@ -6,9 +6,9 @@ import ( "sync" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" - mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker" ) // Map of all available metric collectors @@ -37,7 +37,8 @@ var AvailableCollectors = map[string]MetricCollector{ "beegfs_meta": new(BeegfsMetaCollector), "beegfs_storage": new(BeegfsStorageCollector), "rocm_smi": new(RocmSmiCollector), - "schedstat": new(SchedstatCollector), + "self": new(SelfCollector), + "schedstat": new(SchedstatCollector), } // Metric collector manager data structure diff --git a/collectors/cpufreqCpuinfoMetric.go b/collectors/cpufreqCpuinfoMetric.go index 80732ff..a6716d3 100644 --- a/collectors/cpufreqCpuinfoMetric.go +++ b/collectors/cpufreqCpuinfoMetric.go @@ -10,8 +10,8 @@ import ( "strings" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) // diff --git a/collectors/cpufreqMetric.go b/collectors/cpufreqMetric.go index e6a0081..cf67457 100644 --- a/collectors/cpufreqMetric.go +++ b/collectors/cpufreqMetric.go @@ -3,14 +3,14 @@ package collectors import ( "encoding/json" "fmt" - "io/ioutil" + "os" "path/filepath" "strconv" "strings" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" "golang.org/x/sys/unix" ) @@ -88,7 +88,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error { // Read package ID physicalPackageIDFile := filepath.Join(cpuDir, "topology", "physical_package_id") - line, err := ioutil.ReadFile(physicalPackageIDFile) + line, err := os.ReadFile(physicalPackageIDFile) if err != nil { return fmt.Errorf("unable to read physical package ID from file '%s': %v", physicalPackageIDFile, err) } @@ -100,7 +100,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error { // Read core ID coreIDFile := filepath.Join(cpuDir, "topology", "core_id") - line, err = ioutil.ReadFile(coreIDFile) + line, err = os.ReadFile(coreIDFile) if err != nil { return fmt.Errorf("unable to read core ID from file '%s': %v", coreIDFile, err) } @@ -188,7 +188,7 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMetric) } // Read current frequency - line, err := ioutil.ReadFile(t.scalingCurFreqFile) + line, err := os.ReadFile(t.scalingCurFreqFile) if err != nil { cclog.ComponentError( m.name, diff --git a/collectors/cpustatMetric.go b/collectors/cpustatMetric.go index 3c09b83..bd6ec2f 100644 --- a/collectors/cpustatMetric.go +++ b/collectors/cpustatMetric.go @@ -9,8 +9,8 @@ import ( "strings" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" sysconf "github.com/tklauser/go-sysconf" ) @@ -22,12 +22,12 @@ type CpustatCollectorConfig struct { type CpustatCollector struct { metricCollector - config CpustatCollectorConfig - lastTimestamp time.Time // Store time stamp of last tick to derive values - matches map[string]int - cputags map[string]map[string]string - nodetags map[string]string - olddata map[string]map[string]int64 + config CpustatCollectorConfig + lastTimestamp time.Time // Store time stamp of last tick to derive values + matches map[string]int + cputags map[string]map[string]string + nodetags map[string]string + olddata map[string]map[string]int64 } func (m *CpustatCollector) Init(config json.RawMessage) error { @@ -118,7 +118,7 @@ func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]st } } } - + for name, value := range values { y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": value * 100}, now) if err == nil { diff --git a/collectors/customCmdMetric.go b/collectors/customCmdMetric.go index 492dd48..e150014 100644 --- a/collectors/customCmdMetric.go +++ b/collectors/customCmdMetric.go @@ -3,13 +3,13 @@ package collectors import ( "encoding/json" "errors" - "io/ioutil" "log" + "os" "os/exec" "strings" "time" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" influx "github.com/influxdata/line-protocol" ) @@ -53,7 +53,7 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error { } } for _, f := range m.config.Files { - _, err = ioutil.ReadFile(f) + _, err = os.ReadFile(f) if err == nil { m.files = append(m.files, f) } else { @@ -106,7 +106,7 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMetri } } for _, file := range m.files { - buffer, err := ioutil.ReadFile(file) + buffer, err := os.ReadFile(file) if err != nil { log.Print(err) return diff --git a/collectors/diskstatMetric.go b/collectors/diskstatMetric.go index 69ffe07..d1ec4fc 100644 --- a/collectors/diskstatMetric.go +++ b/collectors/diskstatMetric.go @@ -8,8 +8,8 @@ import ( "syscall" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) // "log" diff --git a/collectors/gpfsMetric.go b/collectors/gpfsMetric.go index ca9affe..b3f9fc4 100644 --- a/collectors/gpfsMetric.go +++ b/collectors/gpfsMetric.go @@ -5,7 +5,7 @@ import ( "bytes" "encoding/json" "fmt" - "io/ioutil" + "io" "log" "os/exec" "os/user" @@ -13,8 +13,8 @@ import ( "strings" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) const DEFAULT_GPFS_CMD = "mmpmon" @@ -118,8 +118,8 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) { cmd.Stderr = cmdStderr err := cmd.Run() if err != nil { - dataStdErr, _ := ioutil.ReadAll(cmdStderr) - dataStdOut, _ := ioutil.ReadAll(cmdStdout) + dataStdErr, _ := io.ReadAll(cmdStderr) + dataStdOut, _ := io.ReadAll(cmdStdout) cclog.ComponentError( m.name, fmt.Sprintf("Read(): Failed to execute command \"%s\": %v\n", cmd.String(), err), diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go index d6613c5..7560af9 100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -2,11 +2,10 @@ package collectors import ( "fmt" - "io/ioutil" "os" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" "golang.org/x/sys/unix" "encoding/json" @@ -85,7 +84,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error { for _, path := range ibDirs { // Skip, when no LID is assigned - line, err := ioutil.ReadFile(filepath.Join(path, "lid")) + line, err := os.ReadFile(filepath.Join(path, "lid")) if err != nil { continue } @@ -175,7 +174,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr for counterName, counterDef := range info.portCounterFiles { // Read counter file - line, err := ioutil.ReadFile(counterDef.path) + line, err := os.ReadFile(counterDef.path) if err != nil { cclog.ComponentError( m.name, diff --git a/collectors/iostatMetric.go b/collectors/iostatMetric.go index 19b4157..4d1dbd1 100644 --- a/collectors/iostatMetric.go +++ b/collectors/iostatMetric.go @@ -4,8 +4,8 @@ import ( "bufio" "os" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" // "log" "encoding/json" diff --git a/collectors/ipmiMetric.go b/collectors/ipmiMetric.go index 50605ac..32c4c45 100644 --- a/collectors/ipmiMetric.go +++ b/collectors/ipmiMetric.go @@ -10,7 +10,7 @@ import ( "strings" "time" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) const IPMITOOL_PATH = `ipmitool` diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go index f22d486..265d84c 100644 --- a/collectors/likwidMetric.go +++ b/collectors/likwidMetric.go @@ -12,7 +12,6 @@ import ( "encoding/json" "errors" "fmt" - "io/ioutil" "math" "os" "os/signal" @@ -24,10 +23,10 @@ import ( "time" "unsafe" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" - topo "github.com/ClusterCockpit/cc-metric-collector/internal/ccTopology" agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology" "github.com/NVIDIA/go-nvml/pkg/dl" ) @@ -154,7 +153,7 @@ func getBaseFreq() float64 { } var freq float64 = math.NaN() for _, f := range files { - buffer, err := ioutil.ReadFile(f) + buffer, err := os.ReadFile(f) if err == nil { data := strings.Replace(string(buffer), "\n", "", -1) x, err := strconv.ParseInt(data, 0, 64) diff --git a/collectors/loadavgMetric.go b/collectors/loadavgMetric.go index 58fb102..887e63e 100644 --- a/collectors/loadavgMetric.go +++ b/collectors/loadavgMetric.go @@ -3,13 +3,13 @@ package collectors import ( "encoding/json" "fmt" - "io/ioutil" + "os" "strconv" "strings" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) // @@ -72,7 +72,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMetric) if !m.init { return } - buffer, err := ioutil.ReadFile(LOADAVGFILE) + buffer, err := os.ReadFile(LOADAVGFILE) if err != nil { if err != nil { cclog.ComponentError( diff --git a/collectors/lustreMetric.go b/collectors/lustreMetric.go index eade2ca..bcb9ca6 100644 --- a/collectors/lustreMetric.go +++ b/collectors/lustreMetric.go @@ -10,8 +10,8 @@ import ( "strings" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) const LUSTRE_SYSFS = `/sys/fs/lustre` diff --git a/collectors/memstatMetric.go b/collectors/memstatMetric.go index 9841a01..4aec4c8 100644 --- a/collectors/memstatMetric.go +++ b/collectors/memstatMetric.go @@ -12,8 +12,8 @@ import ( "strings" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) const MEMSTATFILE = "/proc/meminfo" @@ -68,7 +68,8 @@ func getStats(filename string) map[string]MemstatStats { } else if len(linefields) == 5 { v, err := strconv.ParseFloat(linefields[3], 64) if err == nil { - stats[strings.Trim(linefields[0], ":")] = MemstatStats{ + cclog.ComponentDebug("getStats", strings.Trim(linefields[2], ":"), v, linefields[4]) + stats[strings.Trim(linefields[2], ":")] = MemstatStats{ value: v, unit: linefields[4], } @@ -160,7 +161,6 @@ func (m *MemstatCollector) Init(config json.RawMessage) error { func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric) { if !m.init { - cclog.ComponentPrint(m.name, "Here") return } @@ -188,16 +188,20 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric) unit := "" if totalVal, total := stats["MemTotal"]; total { if freeVal, free := stats["MemFree"]; free { + memUsed = totalVal.value - freeVal.value + if len(totalVal.unit) > 0 { + unit = totalVal.unit + } else if len(freeVal.unit) > 0 { + unit = freeVal.unit + } if bufVal, buffers := stats["Buffers"]; buffers { + memUsed -= bufVal.value + if len(bufVal.unit) > 0 && len(unit) == 0 { + unit = bufVal.unit + } if cacheVal, cached := stats["Cached"]; cached { - memUsed = totalVal.value - (freeVal.value + bufVal.value + cacheVal.value) - if len(totalVal.unit) > 0 { - unit = totalVal.unit - } else if len(freeVal.unit) > 0 { - unit = freeVal.unit - } else if len(bufVal.unit) > 0 { - unit = bufVal.unit - } else if len(cacheVal.unit) > 0 { + memUsed -= cacheVal.value + if len(cacheVal.unit) > 0 && len(unit) == 0 { unit = cacheVal.unit } } diff --git a/collectors/metricCollector.go b/collectors/metricCollector.go index 4d52571..f09fa61 100644 --- a/collectors/metricCollector.go +++ b/collectors/metricCollector.go @@ -5,7 +5,7 @@ import ( "fmt" "time" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) type MetricCollector interface { diff --git a/collectors/netstatMetric.go b/collectors/netstatMetric.go index 8cfb34e..8428df1 100644 --- a/collectors/netstatMetric.go +++ b/collectors/netstatMetric.go @@ -9,8 +9,8 @@ import ( "strings" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) const NETSTATFILE = "/proc/net/dev" diff --git a/collectors/nfsMetric.go b/collectors/nfsMetric.go index 6b15784..7dca096 100644 --- a/collectors/nfsMetric.go +++ b/collectors/nfsMetric.go @@ -11,7 +11,7 @@ import ( "strings" "time" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) // First part contains the code for the general NfsCollector. diff --git a/collectors/numastatsMetric.go b/collectors/numastatsMetric.go index f65a019..8eaac67 100644 --- a/collectors/numastatsMetric.go +++ b/collectors/numastatsMetric.go @@ -10,8 +10,8 @@ import ( "strings" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) // diff --git a/collectors/nvidiaMetric.go b/collectors/nvidiaMetric.go index 0eb7c8a..a1fed03 100644 --- a/collectors/nvidiaMetric.go +++ b/collectors/nvidiaMetric.go @@ -8,8 +8,8 @@ import ( "strings" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" "github.com/NVIDIA/go-nvml/pkg/nvml" ) diff --git a/collectors/rocmsmiMetric.go b/collectors/rocmsmiMetric.go index c717a5d..9d8625d 100644 --- a/collectors/rocmsmiMetric.go +++ b/collectors/rocmsmiMetric.go @@ -6,8 +6,8 @@ import ( "fmt" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" "github.com/ClusterCockpit/go-rocm-smi/pkg/rocm_smi" ) @@ -66,14 +66,14 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error { ret := rocm_smi.Init() if ret != rocm_smi.STATUS_SUCCESS { - err = errors.New("Failed to initialize ROCm SMI library") + err = errors.New("failed to initialize ROCm SMI library") cclog.ComponentError(m.name, err.Error()) return err } numDevs, ret := rocm_smi.NumMonitorDevices() if ret != rocm_smi.STATUS_SUCCESS { - err = errors.New("Failed to get number of GPUs from ROCm SMI library") + err = errors.New("failed to get number of GPUs from ROCm SMI library") cclog.ComponentError(m.name, err.Error()) return err } @@ -98,14 +98,14 @@ func (m *RocmSmiCollector) Init(config json.RawMessage) error { } device, ret := rocm_smi.DeviceGetHandleByIndex(i) if ret != rocm_smi.STATUS_SUCCESS { - err = fmt.Errorf("Failed to get handle for GPU %d", i) + err = fmt.Errorf("failed to get handle for GPU %d", i) cclog.ComponentError(m.name, err.Error()) return err } pciInfo, ret := rocm_smi.DeviceGetPciInfo(device) if ret != rocm_smi.STATUS_SUCCESS { - err = fmt.Errorf("Failed to get PCI information for GPU %d", i) + err = fmt.Errorf("failed to get PCI information for GPU %d", i) cclog.ComponentError(m.name, err.Error()) return err } diff --git a/collectors/sampleMetric.go b/collectors/sampleMetric.go index 47ec296..056ed85 100644 --- a/collectors/sampleMetric.go +++ b/collectors/sampleMetric.go @@ -4,8 +4,8 @@ import ( "encoding/json" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) // These are the fields we read from the JSON configuration diff --git a/collectors/sampleTimerMetric.go b/collectors/sampleTimerMetric.go index aa6807e..8b09bc1 100644 --- a/collectors/sampleTimerMetric.go +++ b/collectors/sampleTimerMetric.go @@ -5,8 +5,8 @@ import ( "sync" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) // These are the fields we read from the JSON configuration diff --git a/collectors/schedstatMetric.go b/collectors/schedstatMetric.go index e3041ae..8c010ed 100644 --- a/collectors/schedstatMetric.go +++ b/collectors/schedstatMetric.go @@ -1,17 +1,17 @@ package collectors import ( + "bufio" "encoding/json" "fmt" - "bufio" - "time" - "os" - "strings" - "strconv" "math" + "os" + "strconv" + "strings" + "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) const SCHEDSTATFILE = `/proc/schedstat` @@ -25,11 +25,11 @@ type SchedstatCollectorConfig struct { // defined by metricCollector (name, init, ...) type SchedstatCollector struct { metricCollector - config SchedstatCollectorConfig // the configuration structure - lastTimestamp time.Time // Store time stamp of last tick to derive values - meta map[string]string // default meta information - cputags map[string]map[string]string // default tags - olddata map[string]map[string]int64 // default tags + config SchedstatCollectorConfig // the configuration structure + lastTimestamp time.Time // Store time stamp of last tick to derive values + meta map[string]string // default meta information + cputags map[string]map[string]string // default tags + olddata map[string]map[string]int64 // default tags } // Functions to implement MetricCollector interface @@ -52,7 +52,7 @@ func (m *SchedstatCollector) Init(config json.RawMessage) error { // Define meta information sent with each metric // (Can also be dynamic or this is the basic set with extension through AddMeta()) m.meta = map[string]string{"source": m.name, "group": "SCHEDSTAT"} - + // Read in the JSON configuration if len(config) > 0 { err = json.Unmarshal(config, &m.config) @@ -83,12 +83,11 @@ func (m *SchedstatCollector) Init(config json.RawMessage) error { running, _ := strconv.ParseInt(linefields[7], 10, 64) waiting, _ := strconv.ParseInt(linefields[8], 10, 64) m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)} - m.olddata[linefields[0]] = map[string]int64{"running" : running, "waiting" : waiting} + m.olddata[linefields[0]] = map[string]int64{"running": running, "waiting": waiting} num_cpus++ } } - // Save current timestamp m.lastTimestamp = time.Now() @@ -102,7 +101,7 @@ func (m *SchedstatCollector) ParseProcLine(linefields []string, tags map[string] waiting, _ := strconv.ParseInt(linefields[8], 10, 64) diff_running := running - m.olddata[linefields[0]]["running"] diff_waiting := waiting - m.olddata[linefields[0]]["waiting"] - + var l_running float64 = float64(diff_running) / tsdelta.Seconds() / (math.Pow(1000, 3)) var l_waiting float64 = float64(diff_waiting) / tsdelta.Seconds() / (math.Pow(1000, 3)) @@ -110,11 +109,11 @@ func (m *SchedstatCollector) ParseProcLine(linefields []string, tags map[string] m.olddata[linefields[0]]["waiting"] = waiting value := l_running + l_waiting - y, err := lp.New("cpu_load_core", tags, m.meta, map[string]interface{}{"value": value}, now) + y, err := lp.New("cpu_load_core", tags, m.meta, map[string]interface{}{"value": value}, now) if err == nil { // Send it to output channel output <- y - } + } } // Read collects all metrics belonging to the sample collector diff --git a/collectors/selfMetric.go b/collectors/selfMetric.go new file mode 100644 index 0000000..4fc95c0 --- /dev/null +++ b/collectors/selfMetric.go @@ -0,0 +1,144 @@ +package collectors + +import ( + "encoding/json" + "runtime" + "syscall" + "time" + + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" +) + +type SelfCollectorConfig struct { + MemStats bool `json:"read_mem_stats"` + GoRoutines bool `json:"read_goroutines"` + CgoCalls bool `json:"read_cgo_calls"` + Rusage bool `json:"read_rusage"` +} + +type SelfCollector struct { + metricCollector + config SelfCollectorConfig // the configuration structure + meta map[string]string // default meta information + tags map[string]string // default tags +} + +func (m *SelfCollector) Init(config json.RawMessage) error { + var err error = nil + m.name = "SelfCollector" + m.setup() + m.parallel = true + m.meta = map[string]string{"source": m.name, "group": "Self"} + m.tags = map[string]string{"type": "node"} + if len(config) > 0 { + err = json.Unmarshal(config, &m.config) + if err != nil { + cclog.ComponentError(m.name, "Error reading config:", err.Error()) + return err + } + } + m.init = true + return err +} + +func (m *SelfCollector) Read(interval time.Duration, output chan lp.CCMetric) { + timestamp := time.Now() + + if m.config.MemStats { + var memstats runtime.MemStats + runtime.ReadMemStats(&memstats) + + y, err := lp.New("total_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.TotalAlloc}, timestamp) + if err == nil { + y.AddMeta("unit", "Bytes") + output <- y + } + y, err = lp.New("heap_alloc", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapAlloc}, timestamp) + if err == nil { + y.AddMeta("unit", "Bytes") + output <- y + } + y, err = lp.New("heap_sys", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapSys}, timestamp) + if err == nil { + y.AddMeta("unit", "Bytes") + output <- y + } + y, err = lp.New("heap_idle", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapIdle}, timestamp) + if err == nil { + y.AddMeta("unit", "Bytes") + output <- y + } + y, err = lp.New("heap_inuse", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapInuse}, timestamp) + if err == nil { + y.AddMeta("unit", "Bytes") + output <- y + } + y, err = lp.New("heap_released", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapReleased}, timestamp) + if err == nil { + y.AddMeta("unit", "Bytes") + output <- y + } + y, err = lp.New("heap_objects", m.tags, m.meta, map[string]interface{}{"value": memstats.HeapObjects}, timestamp) + if err == nil { + output <- y + } + } + if m.config.GoRoutines { + y, err := lp.New("num_goroutines", m.tags, m.meta, map[string]interface{}{"value": runtime.NumGoroutine()}, timestamp) + if err == nil { + output <- y + } + } + if m.config.CgoCalls { + y, err := lp.New("num_cgo_calls", m.tags, m.meta, map[string]interface{}{"value": runtime.NumCgoCall()}, timestamp) + if err == nil { + output <- y + } + } + if m.config.Rusage { + var rusage syscall.Rusage + err := syscall.Getrusage(syscall.RUSAGE_SELF, &rusage) + if err == nil { + sec, nsec := rusage.Utime.Unix() + t := float64(sec) + (float64(nsec) * 1e-9) + y, err := lp.New("rusage_user_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp) + if err == nil { + y.AddMeta("unit", "seconds") + output <- y + } + sec, nsec = rusage.Stime.Unix() + t = float64(sec) + (float64(nsec) * 1e-9) + y, err = lp.New("rusage_system_time", m.tags, m.meta, map[string]interface{}{"value": t}, timestamp) + if err == nil { + y.AddMeta("unit", "seconds") + output <- y + } + y, err = lp.New("rusage_vol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nvcsw}, timestamp) + if err == nil { + output <- y + } + y, err = lp.New("rusage_invol_ctx_switch", m.tags, m.meta, map[string]interface{}{"value": rusage.Nivcsw}, timestamp) + if err == nil { + output <- y + } + y, err = lp.New("rusage_signals", m.tags, m.meta, map[string]interface{}{"value": rusage.Nsignals}, timestamp) + if err == nil { + output <- y + } + y, err = lp.New("rusage_major_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Majflt}, timestamp) + if err == nil { + output <- y + } + y, err = lp.New("rusage_minor_pgfaults", m.tags, m.meta, map[string]interface{}{"value": rusage.Minflt}, timestamp) + if err == nil { + output <- y + } + } + + } +} + +func (m *SelfCollector) Close() { + m.init = false +} diff --git a/collectors/selfMetric.md b/collectors/selfMetric.md new file mode 100644 index 0000000..ab8e50b --- /dev/null +++ b/collectors/selfMetric.md @@ -0,0 +1,34 @@ +## `self` collector + +```json + "self": { + "read_mem_stats" : true, + "read_goroutines" : true, + "read_cgo_calls" : true, + "read_rusage" : true + } +``` + +The `self` collector reads the data from the `runtime` and `syscall` packages, so monitors the execution of the cc-metric-collector itself. + +Metrics: +* If `read_mem_stats == true`: + * `total_alloc`: The metric reports cumulative bytes allocated for heap objects. + * `heap_alloc`: The metric reports bytes of allocated heap objects. + * `heap_sys`: The metric reports bytes of heap memory obtained from the OS. + * `heap_idle`: The metric reports bytes in idle (unused) spans. + * `heap_inuse`: The metric reports bytes in in-use spans. + * `heap_released`: The metric reports bytes of physical memory returned to the OS. + * `heap_objects`: The metric reports the number of allocated heap objects. +* If `read_goroutines == true`: + * `num_goroutines`: The metric reports the number of goroutines that currently exist. +* If `read_cgo_calls == true`: + * `num_cgo_calls`: The metric reports the number of cgo calls made by the current process. +* If `read_rusage == true`: + * `rusage_user_time`: The metric reports the amount of time that this process has been scheduled in user mode. + * `rusage_system_time`: The metric reports the amount of time that this process has been scheduled in kernel mode. + * `rusage_vol_ctx_switch`: The metric reports the amount of voluntary context switches. + * `rusage_invol_ctx_switch`: The metric reports the amount of involuntary context switches. + * `rusage_signals`: The metric reports the number of signals received. + * `rusage_major_pgfaults`: The metric reports the number of major faults the process has made which have required loading a memory page from disk. + * `rusage_minor_pgfaults`: The metric reports the number of minor faults the process has made which have not required loading a memory page from disk. diff --git a/collectors/tempMetric.go b/collectors/tempMetric.go index af9d7fd..303be4a 100644 --- a/collectors/tempMetric.go +++ b/collectors/tempMetric.go @@ -3,14 +3,14 @@ package collectors import ( "encoding/json" "fmt" - "io/ioutil" + "os" "path/filepath" "strconv" "strings" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) // See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html @@ -83,14 +83,14 @@ func (m *TempCollector) Init(config json.RawMessage) error { // sensor name nameFile := filepath.Join(filepath.Dir(file), "name") - name, err := ioutil.ReadFile(nameFile) + name, err := os.ReadFile(nameFile) if err == nil { sensor.name = strings.TrimSpace(string(name)) } // sensor label labelFile := strings.TrimSuffix(file, "_input") + "_label" - label, err := ioutil.ReadFile(labelFile) + label, err := os.ReadFile(labelFile) if err == nil { sensor.label = strings.TrimSpace(string(label)) } @@ -117,7 +117,7 @@ func (m *TempCollector) Init(config json.RawMessage) error { } // Sensor file - _, err = ioutil.ReadFile(file) + _, err = os.ReadFile(file) if err != nil { continue } @@ -139,7 +139,7 @@ func (m *TempCollector) Init(config json.RawMessage) error { // max temperature if m.config.ReportMaxTemp { maxTempFile := strings.TrimSuffix(file, "_input") + "_max" - if buffer, err := ioutil.ReadFile(maxTempFile); err == nil { + if buffer, err := os.ReadFile(maxTempFile); err == nil { if x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64); err == nil { sensor.maxTempName = strings.Replace(sensor.metricName, "temp", "max_temp", 1) sensor.maxTemp = x / 1000 @@ -150,7 +150,7 @@ func (m *TempCollector) Init(config json.RawMessage) error { // critical temperature if m.config.ReportCriticalTemp { criticalTempFile := strings.TrimSuffix(file, "_input") + "_crit" - if buffer, err := ioutil.ReadFile(criticalTempFile); err == nil { + if buffer, err := os.ReadFile(criticalTempFile); err == nil { if x, err := strconv.ParseInt(strings.TrimSpace(string(buffer)), 10, 64); err == nil { sensor.critTempName = strings.Replace(sensor.metricName, "temp", "crit_temp", 1) sensor.critTemp = x / 1000 @@ -175,7 +175,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMetric) { for _, sensor := range m.sensors { // Read sensor file - buffer, err := ioutil.ReadFile(sensor.file) + buffer, err := os.ReadFile(sensor.file) if err != nil { cclog.ComponentError( m.name, diff --git a/collectors/topprocsMetric.go b/collectors/topprocsMetric.go index 1f4aaca..08dbae0 100644 --- a/collectors/topprocsMetric.go +++ b/collectors/topprocsMetric.go @@ -9,7 +9,7 @@ import ( "strings" "time" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) const MAX_NUM_PROCS = 10 diff --git a/go.mod b/go.mod index a42ae8a..30a961c 100644 --- a/go.mod +++ b/go.mod @@ -14,6 +14,7 @@ require ( github.com/nats-io/nats.go v1.16.0 github.com/prometheus/client_golang v1.12.2 github.com/stmcginnis/gofish v0.13.0 + github.com/tklauser/go-sysconf v0.3.10 golang.org/x/sys v0.0.0-20220712014510-0a85c31ab51e ) @@ -33,6 +34,7 @@ require ( github.com/prometheus/common v0.37.0 // indirect github.com/prometheus/procfs v0.7.3 // indirect github.com/shopspring/decimal v1.3.1 // indirect + github.com/tklauser/numcpus v0.4.0 // indirect golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d // indirect golang.org/x/net v0.0.0-20220708220712-1185a9018129 // indirect google.golang.org/protobuf v1.28.0 // indirect diff --git a/go.sum b/go.sum index 9f65f7e..03d7b08 100644 --- a/go.sum +++ b/go.sum @@ -287,6 +287,10 @@ github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1 h1:5TQK59W5E3v0r2duFAb7P95B6hEeOyEnHRa8MjYSMTY= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/tklauser/go-sysconf v0.3.10 h1:IJ1AZGZRWbY8T5Vfk04D9WOA5WSejdflXxP03OUqALw= +github.com/tklauser/go-sysconf v0.3.10/go.mod h1:C8XykCvCb+Gn0oNCWPIlcb0RuglQTYaQ2hGm7jmxEFk= +github.com/tklauser/numcpus v0.4.0 h1:E53Dm1HjH1/R2/aoCtXtPgzmElmn51aOkhCFSuZq//o= +github.com/tklauser/numcpus v0.4.0/go.mod h1:1+UI3pD8NW14VMwdgJNJ1ESk2UnwhAnz5hMwiKKqXCQ= github.com/ugorji/go v1.1.7/go.mod h1:kZn38zHttfInRq0xu/PH0az30d+z6vm202qpg1oXVMw= github.com/ugorji/go v1.2.7/go.mod h1:nF9osbDWLy6bDVv/Rtoh6QgnvNDpmCalQV5urGCCS6M= github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY= @@ -445,6 +449,7 @@ golang.org/x/sys v0.0.0-20211019181941-9d821ace8654/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20211103235746-7861aae1554b/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220114195835-da31bd327af9/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220128215802-99c3d69c2c27/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220513210249-45d2b4557a2a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220712014510-0a85c31ab51e h1:NHvCuwuS43lGnYhten69ZWqi2QOj/CiDNcKbVqwVoew= golang.org/x/sys v0.0.0-20220712014510-0a85c31ab51e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= diff --git a/internal/metricAggregator/metricAggregator.go b/internal/metricAggregator/metricAggregator.go index f5c7ada..50bc963 100644 --- a/internal/metricAggregator/metricAggregator.go +++ b/internal/metricAggregator/metricAggregator.go @@ -9,10 +9,10 @@ import ( "sync" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" - topo "github.com/ClusterCockpit/cc-metric-collector/internal/ccTopology" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology" "github.com/PaesslerAG/gval" ) diff --git a/internal/metricAggregator/metricAggregatorFunctions.go b/internal/metricAggregator/metricAggregatorFunctions.go index 945dc6d..68d205b 100644 --- a/internal/metricAggregator/metricAggregatorFunctions.go +++ b/internal/metricAggregator/metricAggregatorFunctions.go @@ -8,8 +8,8 @@ import ( "sort" "strings" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - topo "github.com/ClusterCockpit/cc-metric-collector/internal/ccTopology" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + topo "github.com/ClusterCockpit/cc-metric-collector/pkg/ccTopology" ) /* diff --git a/internal/metricRouter/metricCache.go b/internal/metricRouter/metricCache.go index 8886f47..81e69a9 100644 --- a/internal/metricRouter/metricCache.go +++ b/internal/metricRouter/metricCache.go @@ -4,11 +4,11 @@ import ( "sync" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator" - mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker" ) type metricCachePeriod struct { diff --git a/internal/metricRouter/metricRouter.go b/internal/metricRouter/metricRouter.go index 2614ced..32ac0f3 100644 --- a/internal/metricRouter/metricRouter.go +++ b/internal/metricRouter/metricRouter.go @@ -7,11 +7,11 @@ import ( "sync" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator" - mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" + mct "github.com/ClusterCockpit/cc-metric-collector/pkg/multiChanTicker" units "github.com/ClusterCockpit/cc-units" ) diff --git a/internal/ccLogger/cclogger.go b/pkg/ccLogger/cclogger.go similarity index 100% rename from internal/ccLogger/cclogger.go rename to pkg/ccLogger/cclogger.go diff --git a/internal/ccMetric/README.md b/pkg/ccMetric/README.md similarity index 100% rename from internal/ccMetric/README.md rename to pkg/ccMetric/README.md diff --git a/internal/ccMetric/ccMetric.go b/pkg/ccMetric/ccMetric.go similarity index 100% rename from internal/ccMetric/ccMetric.go rename to pkg/ccMetric/ccMetric.go diff --git a/internal/ccTopology/ccTopology.go b/pkg/ccTopology/ccTopology.go similarity index 99% rename from internal/ccTopology/ccTopology.go rename to pkg/ccTopology/ccTopology.go index 0ed8883..f7d7092 100644 --- a/internal/ccTopology/ccTopology.go +++ b/pkg/ccTopology/ccTopology.go @@ -10,7 +10,7 @@ import ( "strconv" "strings" - cclogger "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" + cclogger "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" ) const SYSFS_NUMABASE = `/sys/devices/system/node` diff --git a/internal/multiChanTicker/README.md b/pkg/multiChanTicker/README.md similarity index 100% rename from internal/multiChanTicker/README.md rename to pkg/multiChanTicker/README.md diff --git a/internal/multiChanTicker/multiChanTicker.go b/pkg/multiChanTicker/multiChanTicker.go similarity index 93% rename from internal/multiChanTicker/multiChanTicker.go rename to pkg/multiChanTicker/multiChanTicker.go index e0eca43..1791474 100644 --- a/internal/multiChanTicker/multiChanTicker.go +++ b/pkg/multiChanTicker/multiChanTicker.go @@ -3,7 +3,7 @@ package multiChanTicker import ( "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" ) type multiChanTicker struct { diff --git a/receivers/httpReceiver.go b/receivers/httpReceiver.go index 974bcd6..efd31ac 100644 --- a/receivers/httpReceiver.go +++ b/receivers/httpReceiver.go @@ -10,8 +10,8 @@ import ( "strings" "sync" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" "github.com/gorilla/mux" influx "github.com/influxdata/line-protocol" ) diff --git a/receivers/metricReceiver.go b/receivers/metricReceiver.go index 6b85fd4..1edef4e 100644 --- a/receivers/metricReceiver.go +++ b/receivers/metricReceiver.go @@ -1,7 +1,7 @@ package receivers import ( - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) type defaultReceiverConfig struct { diff --git a/receivers/natsReceiver.go b/receivers/natsReceiver.go index 1a5f47b..095a7ee 100644 --- a/receivers/natsReceiver.go +++ b/receivers/natsReceiver.go @@ -6,8 +6,8 @@ import ( "fmt" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" influx "github.com/influxdata/line-protocol" nats "github.com/nats-io/nats.go" ) diff --git a/receivers/prometheusReceiver.go b/receivers/prometheusReceiver.go index c22969d..7846c1d 100644 --- a/receivers/prometheusReceiver.go +++ b/receivers/prometheusReceiver.go @@ -12,8 +12,8 @@ import ( "sync" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) type PrometheusReceiverConfig struct { diff --git a/receivers/receiveManager.go b/receivers/receiveManager.go index 7a20fac..c47c3cc 100644 --- a/receivers/receiveManager.go +++ b/receivers/receiveManager.go @@ -5,8 +5,8 @@ import ( "os" "sync" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) var AvailableReceivers = map[string]func(name string, config json.RawMessage) (Receiver, error){ diff --git a/receivers/redfishReceiver.go b/receivers/redfishReceiver.go index 18c3ee8..de9744d 100644 --- a/receivers/redfishReceiver.go +++ b/receivers/redfishReceiver.go @@ -10,8 +10,8 @@ import ( "sync" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" // See: https://pkg.go.dev/github.com/stmcginnis/gofish "github.com/stmcginnis/gofish" diff --git a/receivers/sampleReceiver.go b/receivers/sampleReceiver.go index 19d6f25..86e68cd 100644 --- a/receivers/sampleReceiver.go +++ b/receivers/sampleReceiver.go @@ -4,7 +4,7 @@ import ( "encoding/json" "fmt" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" ) // SampleReceiver configuration: receiver type, listen address, port diff --git a/sinks/gangliaCommon.go b/sinks/gangliaCommon.go index f92550b..1c846f0 100644 --- a/sinks/gangliaCommon.go +++ b/sinks/gangliaCommon.go @@ -4,7 +4,7 @@ import ( "fmt" "strings" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) func GangliaMetricName(point lp.CCMetric) string { diff --git a/sinks/gangliaSink.go b/sinks/gangliaSink.go index 5324123..b6a0646 100644 --- a/sinks/gangliaSink.go +++ b/sinks/gangliaSink.go @@ -9,8 +9,8 @@ import ( // "time" "os/exec" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) const GMETRIC_EXEC = `gmetric` diff --git a/sinks/httpSink.go b/sinks/httpSink.go index 466915d..49d98b0 100644 --- a/sinks/httpSink.go +++ b/sinks/httpSink.go @@ -9,8 +9,8 @@ import ( "sync" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" influx "github.com/influxdata/line-protocol" ) diff --git a/sinks/influxAsyncSink.go b/sinks/influxAsyncSink.go index bf88079..ccf5302 100644 --- a/sinks/influxAsyncSink.go +++ b/sinks/influxAsyncSink.go @@ -9,8 +9,8 @@ import ( "strings" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" influxdb2 "github.com/influxdata/influxdb-client-go/v2" influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api" influxdb2ApiHttp "github.com/influxdata/influxdb-client-go/v2/api/http" diff --git a/sinks/influxSink.go b/sinks/influxSink.go index 90f8da8..94704cd 100644 --- a/sinks/influxSink.go +++ b/sinks/influxSink.go @@ -9,8 +9,8 @@ import ( "sync" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" influxdb2 "github.com/influxdata/influxdb-client-go/v2" influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api" "github.com/influxdata/influxdb-client-go/v2/api/write" diff --git a/sinks/libgangliaSink.go b/sinks/libgangliaSink.go index 3651584..bb5488a 100644 --- a/sinks/libgangliaSink.go +++ b/sinks/libgangliaSink.go @@ -71,8 +71,8 @@ import ( "fmt" "unsafe" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" "github.com/NVIDIA/go-nvml/pkg/dl" ) diff --git a/sinks/metricSink.go b/sinks/metricSink.go index c6c6860..2fd429c 100644 --- a/sinks/metricSink.go +++ b/sinks/metricSink.go @@ -1,7 +1,7 @@ package sinks import ( - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) type defaultSinkConfig struct { diff --git a/sinks/natsSink.go b/sinks/natsSink.go index 4d43454..a9a980d 100644 --- a/sinks/natsSink.go +++ b/sinks/natsSink.go @@ -8,8 +8,8 @@ import ( "sync" "time" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" influx "github.com/influxdata/line-protocol" nats "github.com/nats-io/nats.go" ) diff --git a/sinks/prometheusSink.go b/sinks/prometheusSink.go index 5011ac0..7a1163d 100644 --- a/sinks/prometheusSink.go +++ b/sinks/prometheusSink.go @@ -9,8 +9,8 @@ import ( "strings" "sync" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" "github.com/gorilla/mux" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" diff --git a/sinks/sampleSink.go b/sinks/sampleSink.go index 2a823e6..9d9e991 100644 --- a/sinks/sampleSink.go +++ b/sinks/sampleSink.go @@ -5,8 +5,8 @@ import ( "fmt" "log" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) type SampleSinkConfig struct { diff --git a/sinks/sinkManager.go b/sinks/sinkManager.go index 6af8614..392346d 100644 --- a/sinks/sinkManager.go +++ b/sinks/sinkManager.go @@ -6,8 +6,8 @@ import ( "os" "sync" - cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + cclog "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) const SINK_MAX_FORWARD = 50 diff --git a/sinks/stdoutSink.go b/sinks/stdoutSink.go index e091af3..eadef4e 100644 --- a/sinks/stdoutSink.go +++ b/sinks/stdoutSink.go @@ -7,7 +7,7 @@ import ( "strings" // "time" - lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric" + lp "github.com/ClusterCockpit/cc-metric-collector/pkg/ccMetric" ) type StdoutSink struct { From 821d10465654e4d5afd77a559fd33de846ace29c Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 13 Oct 2022 16:42:04 +0200 Subject: [PATCH 06/10] Try fixing DEB package --- Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Makefile b/Makefile index 0f9cb45..50b92fc 100644 --- a/Makefile +++ b/Makefile @@ -121,8 +121,14 @@ DEB: scripts/cc-metric-collector.deb.control $(APP) @SIZE_BYTES=$$(du -bcs --exclude=.dpkgbuild "$$WORKSPACE"/ | awk '{print $$1}' | head -1 | sed -e 's/^0\+//') @SIZE="$$(awk -v size="$$SIZE_BYTES" 'BEGIN {print (size/1024)+1}' | awk '{print int($$0)}')" #@sed -e s+"{VERSION}"+"$$VERS"+g -e s+"{INSTALLED_SIZE}"+"$$SIZE"+g -e s+"{ARCH}"+"$$ARCH"+g $$CONTROLFILE > $${DEBIANDIR}/control + @echo "Version: $$VERS" + @echo "Size: $$SIZE" + @echo "Arch: $$ARCH" @sed -e s+"{VERSION}"+"$$VERS"+g -e s+"{INSTALLED_SIZE}"+"$$SIZE"+g -e s+"{ARCH}"+"$$ARCH"+g $$CONTROLFILE > $${DEBIANBINDIR}/control @make PREFIX=$${WORKSPACE} install @DEB_FILE="cc-metric-collector_$${VERS}_$${ARCH}.deb" @dpkg-deb -b $${WORKSPACE} "$$DEB_FILE" + @if [[ "$${GITHUB_ACTIONS}" == true ]]; then + @ echo "::set-output name=DEB::$${DEB_FILE}" + @fi @rm -r "$${WORKSPACE}" From 317d36c9dd1a6af58db85f4d8545fbe577134be6 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 13 Oct 2022 16:46:54 +0200 Subject: [PATCH 07/10] Try fixing DEB package --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index 50b92fc..2f49354 100644 --- a/Makefile +++ b/Makefile @@ -112,7 +112,9 @@ DEB: scripts/cc-metric-collector.deb.control $(APP) #@mkdir --parents --verbose $$DEBIANDIR @CONTROLFILE="$${BASEDIR}/scripts/cc-metric-collector.deb.control" @COMMITISH="HEAD" + @git describe --tags --abbrev=0 $${COMMITISH} @VERS=$$(git describe --tags --abbrev=0 $${COMMITISH}) + @if [ -z "$$VERS" ]; then VERS=${GITHUB_REF_NAME}; fi @VERS=$${VERS#v} @VERS=$$(echo $$VERS | sed -e s+'-'+'_'+g) @ARCH=$$(uname -m) From e79601e2e8489c389693b858b64d2dd61e75ff11 Mon Sep 17 00:00:00 2001 From: Thomas Roehl Date: Thu, 13 Oct 2022 16:49:58 +0200 Subject: [PATCH 08/10] Try fixing DEB package --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2f49354..d63176d 100644 --- a/Makefile +++ b/Makefile @@ -130,7 +130,7 @@ DEB: scripts/cc-metric-collector.deb.control $(APP) @make PREFIX=$${WORKSPACE} install @DEB_FILE="cc-metric-collector_$${VERS}_$${ARCH}.deb" @dpkg-deb -b $${WORKSPACE} "$$DEB_FILE" - @if [[ "$${GITHUB_ACTIONS}" == true ]]; then + @if [ "$${GITHUB_ACTIONS}" = "true" ]; then @ echo "::set-output name=DEB::$${DEB_FILE}" @fi @rm -r "$${WORKSPACE}" From 0f35469168ac16fdfada59c23fb11861c61595bc Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Fri, 4 Nov 2022 14:52:05 +0100 Subject: [PATCH 09/10] Update httpSink.md --- sinks/httpSink.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sinks/httpSink.md b/sinks/httpSink.md index 23203a2..7251ff2 100644 --- a/sinks/httpSink.md +++ b/sinks/httpSink.md @@ -8,7 +8,9 @@ The `http` sink uses POST requests to a HTTP server to submit the metrics in the { "": { "type": "http", - "meta_as_tags" : true, + "meta_as_tags" : [ + "meta-key" + ], "url" : "https://my-monitoring.example.com:1234/api/write", "jwt" : "blabla.blabla.blabla", "timeout": "5s", @@ -20,7 +22,7 @@ The `http` sink uses POST requests to a HTTP server to submit the metrics in the ``` - `type`: makes the sink an `http` sink -- `meta_as_tags`: print all meta information as tags in the output (optional) +- `meta_as_tags`: Move specific meta information to the tags in the output (optional) - `url`: The full URL of the endpoint - `jwt`: JSON web tokens for authentification (Using the *Bearer* scheme) - `timeout`: General timeout for the HTTP client (default '5s') From f0da07310b01898f476ca7502aed81f2cd0c236e Mon Sep 17 00:00:00 2001 From: Thomas Gruber Date: Fri, 4 Nov 2022 14:53:08 +0100 Subject: [PATCH 10/10] Update README.md --- sinks/README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sinks/README.md b/sinks/README.md index df532c9..d6c88b8 100644 --- a/sinks/README.md +++ b/sinks/README.md @@ -20,7 +20,9 @@ The configuration file for the sinks is a list of configurations. The `type` fie [ "mystdout" : { "type" : "stdout", - "meta_as_tags" : false + "meta_as_tags" : [ + "unit" + ] }, "metricstore" : { "type" : "http", @@ -103,4 +105,4 @@ func NewSampleSink(name string, config json.RawMessage) (Sink, error) { return s, err } -``` \ No newline at end of file +```