mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2024-11-10 04:27:25 +01:00
8d85bd53f1
* Cleanup: Remove unused code * Use Golang duration parser for 'interval' and 'duration' in main config * Update handling of LIKWID headers. Download only if not already present in the system. Fixes #73 * Units with cc-units (#64) * Add option to normalize units with cc-unit * Add unit conversion to router * Add option to change unit prefix in the router * Add to MetricRouter README * Add order of operations in router to README * Use second add_tags/del_tags only if metric gets renamed * Skip disks in DiskstatCollector that have size=0 * Check readability of sensor files in TempCollector * Fix for --once option * Rename `cpu` type to `hwthread` (#69) * Rename 'cpu' type to 'hwthread' to avoid naming clashes with MetricStore and CC-Webfrontend * Collectors in parallel (#74) * Provide info to CollectorManager whether the collector can be executed in parallel with others * Split serial and parallel collectors. Read in parallel first * Update NvidiaCollector with new metrics, MIG and NvLink support (#75) * CC topology module update (#76) * Rename CPU to hardware thread, write some comments * Do renaming in other parts * Remove CpuList and SocketList function from metricCollector. Available in ccTopology * Option to use MIG UUID as subtype-id in NvidiaCollector * Option to use MIG slice name as subtype-id in NvidiaCollector * MetricRouter: Fix JSON in README * Fix for Github Action to really use the selected version * Remove Ganglia installation in runonce Action and add Go 1.18 * Fix daemon options in init script * Add separate go.mod files to use it with deprecated 1.16 * Minor updates for Makefiles * fix string comparison * AMD ROCm SMI collector (#77) * Add collector for AMD ROCm SMI metrics * Fix import path * Fix imports * Remove Board Number * store GPU index explicitly * Remove board number from description * Use http instead of ftp to download likwid * Fix serial number in rocmCollector * Improved http sink (#78) * automatic flush in NatsSink * tweak default options of HttpSink * shorter cirt. section and retries for HttpSink * fix error handling * Remove file added by mistake. * Use http instead of ftp to download likwid * Fix serial number in rocmCollector Co-authored-by: Thomas Roehl <thomas.roehl@fau.de> Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Co-authored-by: Lou <lou.knauer@gmx.de>
102 lines
3.8 KiB
Go
102 lines
3.8 KiB
Go
package collectors
|
|
|
|
import (
|
|
"encoding/json"
|
|
"time"
|
|
|
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
|
)
|
|
|
|
// These are the fields we read from the JSON configuration
|
|
type SampleCollectorConfig struct {
|
|
Interval string `json:"interval"`
|
|
}
|
|
|
|
// This contains all variables we need during execution and the variables
|
|
// defined by metricCollector (name, init, ...)
|
|
type SampleCollector struct {
|
|
metricCollector
|
|
config SampleTimerCollectorConfig // the configuration structure
|
|
meta map[string]string // default meta information
|
|
tags map[string]string // default tags
|
|
}
|
|
|
|
// Functions to implement MetricCollector interface
|
|
// Init(...), Read(...), Close()
|
|
// See: metricCollector.go
|
|
|
|
// Init initializes the sample collector
|
|
// Called once by the collector manager
|
|
// All tags, meta data tags and metrics that do not change over the runtime should be set here
|
|
func (m *SampleCollector) Init(config json.RawMessage) error {
|
|
var err error = nil
|
|
// Always set the name early in Init() to use it in cclog.Component* functions
|
|
m.name = "InternalCollector"
|
|
// This is for later use, also call it early
|
|
m.setup()
|
|
// Tell whether the collector should be run in parallel with others (reading files, ...)
|
|
// or it should be run serially, mostly for collectors acutally doing measurements
|
|
// because they should not measure the execution of the other collectors
|
|
m.parallel = true
|
|
// Define meta information sent with each metric
|
|
// (Can also be dynamic or this is the basic set with extension through AddMeta())
|
|
m.meta = map[string]string{"source": m.name, "group": "SAMPLE"}
|
|
// Define tags sent with each metric
|
|
// The 'type' tag is always needed, it defines the granulatity of the metric
|
|
// node -> whole system
|
|
// socket -> CPU socket (requires socket ID as 'type-id' tag)
|
|
// die -> CPU die (requires CPU die ID as 'type-id' tag)
|
|
// memoryDomain -> NUMA domain (requires NUMA domain ID as 'type-id' tag)
|
|
// llc -> Last level cache (requires last level cache ID as 'type-id' tag)
|
|
// core -> single CPU core that may consist of multiple hardware threads (SMT) (requires core ID as 'type-id' tag)
|
|
// hwthtread -> single CPU hardware thread (requires hardware thread ID as 'type-id' tag)
|
|
// accelerator -> A accelerator device like GPU or FPGA (requires an accelerator ID as 'type-id' tag)
|
|
m.tags = map[string]string{"type": "node"}
|
|
// Read in the JSON configuration
|
|
if len(config) > 0 {
|
|
err = json.Unmarshal(config, &m.config)
|
|
if err != nil {
|
|
cclog.ComponentError(m.name, "Error reading config:", err.Error())
|
|
return err
|
|
}
|
|
}
|
|
|
|
// Set up everything that the collector requires during the Read() execution
|
|
// Check files required, test execution of some commands, create data structure
|
|
// for all topological entities (sockets, NUMA domains, ...)
|
|
// Return some useful error message in case of any failures
|
|
|
|
// Set this flag only if everything is initialized properly, all required files exist, ...
|
|
m.init = true
|
|
return err
|
|
}
|
|
|
|
// Read collects all metrics belonging to the sample collector
|
|
// and sends them through the output channel to the collector manager
|
|
func (m *SampleCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
|
// Create a sample metric
|
|
timestamp := time.Now()
|
|
|
|
value := 1.0
|
|
// If you want to measure something for a specific amount of time, use interval
|
|
// start := readState()
|
|
// time.Sleep(interval)
|
|
// stop := readState()
|
|
// value = (stop - start) / interval.Seconds()
|
|
|
|
y, err := lp.New("sample_metric", m.tags, m.meta, map[string]interface{}{"value": value}, timestamp)
|
|
if err == nil {
|
|
// Send it to output channel
|
|
output <- y
|
|
}
|
|
|
|
}
|
|
|
|
// Close metric collector: close network connection, close files, close libraries, ...
|
|
// Called once by the collector manager
|
|
func (m *SampleCollector) Close() {
|
|
// Unset flag
|
|
m.init = false
|
|
}
|