mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-07-20 20:01:40 +02:00
Compare commits
28 Commits
v0.4
...
http_stats
Author | SHA1 | Date | |
---|---|---|---|
|
9dd6ff1a76 | ||
|
257b4a64b5 | ||
|
5eeb097136 | ||
|
4a4992877c | ||
|
9447685a69 | ||
|
28348bd108 | ||
|
a3b9d8a90b | ||
|
7e43e9171e | ||
|
5d25a7bf12 | ||
|
83b4343310 | ||
|
f1d3cabdc6 | ||
|
2a014b6fba | ||
|
50479f9325 | ||
|
e0e91844bc | ||
|
296225f3a8 | ||
|
43bcce6fb5 | ||
|
622e94ae0e | ||
|
c506114480 | ||
|
657543dded | ||
|
beebcd7145 | ||
|
082eea525a | ||
|
2b8266d1d2 | ||
|
d835724d93 | ||
|
c5082bbffe | ||
|
4c1263312b | ||
|
940623585c | ||
|
87ecb12c6f | ||
|
ae64eddcc8 |
@@ -20,6 +20,7 @@ There is a main configuration file with basic settings that point to the other c
|
|||||||
"collectors" : "collectors.json",
|
"collectors" : "collectors.json",
|
||||||
"receivers" : "receivers.json",
|
"receivers" : "receivers.json",
|
||||||
"router" : "router.json",
|
"router" : "router.json",
|
||||||
|
"stats_api" : "api.json",
|
||||||
"interval": 10,
|
"interval": 10,
|
||||||
"duration": 1
|
"duration": 1
|
||||||
}
|
}
|
||||||
@@ -32,6 +33,7 @@ See the component READMEs for their configuration:
|
|||||||
* [`sinks`](./sinks/README.md)
|
* [`sinks`](./sinks/README.md)
|
||||||
* [`receivers`](./receivers/README.md)
|
* [`receivers`](./receivers/README.md)
|
||||||
* [`router`](./internal/metricRouter/README.md)
|
* [`router`](./internal/metricRouter/README.md)
|
||||||
|
* [`stats_api`](./internal/metricRouter/StatsApi.md)
|
||||||
|
|
||||||
|
|
||||||
# Installation
|
# Installation
|
||||||
|
@@ -28,6 +28,7 @@ type CentralConfigFile struct {
|
|||||||
RouterConfigFile string `json:"router"`
|
RouterConfigFile string `json:"router"`
|
||||||
SinkConfigFile string `json:"sinks"`
|
SinkConfigFile string `json:"sinks"`
|
||||||
ReceiverConfigFile string `json:"receivers,omitempty"`
|
ReceiverConfigFile string `json:"receivers,omitempty"`
|
||||||
|
StatsApiConfigFile string `json:"stats_api,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func LoadCentralConfiguration(file string, config *CentralConfigFile) error {
|
func LoadCentralConfiguration(file string, config *CentralConfigFile) error {
|
||||||
@@ -52,6 +53,7 @@ type RuntimeConfig struct {
|
|||||||
CollectManager collectors.CollectorManager
|
CollectManager collectors.CollectorManager
|
||||||
SinkManager sinks.SinkManager
|
SinkManager sinks.SinkManager
|
||||||
ReceiveManager receivers.ReceiveManager
|
ReceiveManager receivers.ReceiveManager
|
||||||
|
StatsApi mr.StatsApi
|
||||||
MultiChanTicker mct.MultiChanTicker
|
MultiChanTicker mct.MultiChanTicker
|
||||||
|
|
||||||
Channels []chan lp.CCMetric
|
Channels []chan lp.CCMetric
|
||||||
@@ -152,11 +154,16 @@ func shutdownHandler(config *RuntimeConfig, shutdownSignal chan os.Signal) {
|
|||||||
cclog.Debug("Shutdown SinkManager...")
|
cclog.Debug("Shutdown SinkManager...")
|
||||||
config.SinkManager.Close()
|
config.SinkManager.Close()
|
||||||
}
|
}
|
||||||
|
if config.StatsApi != nil {
|
||||||
|
cclog.Debug("Shutdown StatsApi...")
|
||||||
|
config.StatsApi.Close()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func mainFunc() int {
|
func mainFunc() int {
|
||||||
var err error
|
var err error
|
||||||
use_recv := false
|
use_recv := false
|
||||||
|
use_api := false
|
||||||
|
|
||||||
// Initialize runtime configuration
|
// Initialize runtime configuration
|
||||||
rcfg := RuntimeConfig{
|
rcfg := RuntimeConfig{
|
||||||
@@ -164,6 +171,7 @@ func mainFunc() int {
|
|||||||
CollectManager: nil,
|
CollectManager: nil,
|
||||||
SinkManager: nil,
|
SinkManager: nil,
|
||||||
ReceiveManager: nil,
|
ReceiveManager: nil,
|
||||||
|
StatsApi: nil,
|
||||||
CliArgs: ReadCli(),
|
CliArgs: ReadCli(),
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -253,6 +261,16 @@ func mainFunc() int {
|
|||||||
use_recv = true
|
use_recv = true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Create new statistics API manager
|
||||||
|
if len(rcfg.ConfigFile.StatsApiConfigFile) > 0 {
|
||||||
|
rcfg.StatsApi, err = mr.NewStatsApi(rcfg.MultiChanTicker, &rcfg.Sync, rcfg.ConfigFile.StatsApiConfigFile)
|
||||||
|
if err != nil {
|
||||||
|
cclog.Error(err.Error())
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
use_api = true
|
||||||
|
}
|
||||||
|
|
||||||
// Create shutdown handler
|
// Create shutdown handler
|
||||||
shutdownSignal := make(chan os.Signal, 1)
|
shutdownSignal := make(chan os.Signal, 1)
|
||||||
signal.Notify(shutdownSignal, os.Interrupt)
|
signal.Notify(shutdownSignal, os.Interrupt)
|
||||||
@@ -260,6 +278,11 @@ func mainFunc() int {
|
|||||||
rcfg.Sync.Add(1)
|
rcfg.Sync.Add(1)
|
||||||
go shutdownHandler(&rcfg, shutdownSignal)
|
go shutdownHandler(&rcfg, shutdownSignal)
|
||||||
|
|
||||||
|
// Start the stats api early to be prepared for init settings
|
||||||
|
if use_api {
|
||||||
|
rcfg.StatsApi.Start()
|
||||||
|
}
|
||||||
|
|
||||||
// Start the managers
|
// Start the managers
|
||||||
rcfg.MetricRouter.Start()
|
rcfg.MetricRouter.Start()
|
||||||
rcfg.SinkManager.Start()
|
rcfg.SinkManager.Start()
|
||||||
|
@@ -16,6 +16,7 @@ import (
|
|||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
)
|
)
|
||||||
|
|
||||||
const DEFAULT_BEEGFS_CMD = "beegfs-ctl"
|
const DEFAULT_BEEGFS_CMD = "beegfs-ctl"
|
||||||
@@ -29,10 +30,11 @@ type BeegfsMetaCollectorConfig struct {
|
|||||||
|
|
||||||
type BeegfsMetaCollector struct {
|
type BeegfsMetaCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
tags map[string]string
|
tags map[string]string
|
||||||
matches map[string]string
|
matches map[string]string
|
||||||
config BeegfsMetaCollectorConfig
|
config BeegfsMetaCollectorConfig
|
||||||
skipFS map[string]struct{}
|
skipFS map[string]struct{}
|
||||||
|
statsProcessedMetrics int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
|
func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
|
||||||
@@ -105,6 +107,7 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to find beegfs-ctl binary '%s': %v", m.config.Beegfs, err)
|
return fmt.Errorf("BeegfsMetaCollector.Init(): Failed to find beegfs-ctl binary '%s': %v", m.config.Beegfs, err)
|
||||||
}
|
}
|
||||||
|
m.statsProcessedMetrics = 0
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -218,10 +221,12 @@ func (m *BeegfsMetaCollector) Read(interval time.Duration, output chan lp.CCMetr
|
|||||||
y, err := lp.New(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
y, err := lp.New(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
stats.ComponentStatInt(m.name, "processed_metrics", m.statsProcessedMetrics)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *BeegfsMetaCollector) Close() {
|
func (m *BeegfsMetaCollector) Close() {
|
||||||
|
@@ -16,6 +16,7 @@ import (
|
|||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Struct for the collector-specific JSON config
|
// Struct for the collector-specific JSON config
|
||||||
@@ -27,10 +28,11 @@ type BeegfsStorageCollectorConfig struct {
|
|||||||
|
|
||||||
type BeegfsStorageCollector struct {
|
type BeegfsStorageCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
tags map[string]string
|
tags map[string]string
|
||||||
matches map[string]string
|
matches map[string]string
|
||||||
config BeegfsStorageCollectorConfig
|
config BeegfsStorageCollectorConfig
|
||||||
skipFS map[string]struct{}
|
skipFS map[string]struct{}
|
||||||
|
statsProcessedMetrics int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
|
func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
|
||||||
@@ -98,6 +100,7 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to find beegfs-ctl binary '%s': %v", m.config.Beegfs, err)
|
return fmt.Errorf("BeegfsStorageCollector.Init(): Failed to find beegfs-ctl binary '%s': %v", m.config.Beegfs, err)
|
||||||
}
|
}
|
||||||
|
m.statsProcessedMetrics = 0
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -210,10 +213,12 @@ func (m *BeegfsStorageCollector) Read(interval time.Duration, output chan lp.CCM
|
|||||||
y, err := lp.New(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
y, err := lp.New(key, m.tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
stats.ComponentStatInt(m.name, "processed_metrics", m.statsProcessedMetrics)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *BeegfsStorageCollector) Close() {
|
func (m *BeegfsStorageCollector) Close() {
|
||||||
|
@@ -12,6 +12,7 @@ import (
|
|||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
)
|
)
|
||||||
|
|
||||||
//
|
//
|
||||||
@@ -36,7 +37,8 @@ type CPUFreqCpuInfoCollectorTopology struct {
|
|||||||
|
|
||||||
type CPUFreqCpuInfoCollector struct {
|
type CPUFreqCpuInfoCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
topology []*CPUFreqCpuInfoCollectorTopology
|
topology []*CPUFreqCpuInfoCollectorTopology
|
||||||
|
statsProcessedMetrics int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
|
func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
|
||||||
@@ -155,7 +157,7 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
|
|||||||
"package_id": t.physicalPackageID,
|
"package_id": t.physicalPackageID,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
m.statsProcessedMetrics = 0
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -196,6 +198,7 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
if y, err := lp.New("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": value}, now); err == nil {
|
if y, err := lp.New("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": value}, now); err == nil {
|
||||||
|
m.statsProcessedMetrics++
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -203,6 +206,7 @@ func (m *CPUFreqCpuInfoCollector) Read(interval time.Duration, output chan lp.CC
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
stats.ComponentStatInt(m.name, "processed_metrics", m.statsProcessedMetrics)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *CPUFreqCpuInfoCollector) Close() {
|
func (m *CPUFreqCpuInfoCollector) Close() {
|
||||||
|
@@ -11,6 +11,7 @@ import (
|
|||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
"golang.org/x/sys/unix"
|
"golang.org/x/sys/unix"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -39,8 +40,9 @@ type CPUFreqCollectorTopology struct {
|
|||||||
//
|
//
|
||||||
type CPUFreqCollector struct {
|
type CPUFreqCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
topology []CPUFreqCollectorTopology
|
topology []CPUFreqCollectorTopology
|
||||||
config struct {
|
statsProcessedMetrics int64
|
||||||
|
config struct {
|
||||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -166,7 +168,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
|
|||||||
"package_id": t.physicalPackageID,
|
"package_id": t.physicalPackageID,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
m.statsProcessedMetrics = 0
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -203,9 +205,11 @@ func (m *CPUFreqCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if y, err := lp.New("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": cpuFreq}, now); err == nil {
|
if y, err := lp.New("cpufreq", t.tagSet, m.meta, map[string]interface{}{"value": cpuFreq}, now); err == nil {
|
||||||
|
m.statsProcessedMetrics++
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
stats.ComponentStatInt(m.name, "processed_metrics", m.statsProcessedMetrics)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *CPUFreqCollector) Close() {
|
func (m *CPUFreqCollector) Close() {
|
||||||
|
@@ -11,6 +11,7 @@ import (
|
|||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
)
|
)
|
||||||
|
|
||||||
const CPUSTATFILE = `/proc/stat`
|
const CPUSTATFILE = `/proc/stat`
|
||||||
@@ -21,10 +22,11 @@ type CpustatCollectorConfig struct {
|
|||||||
|
|
||||||
type CpustatCollector struct {
|
type CpustatCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
config CpustatCollectorConfig
|
config CpustatCollectorConfig
|
||||||
matches map[string]int
|
matches map[string]int
|
||||||
cputags map[string]map[string]string
|
cputags map[string]map[string]string
|
||||||
nodetags map[string]string
|
nodetags map[string]string
|
||||||
|
statsProcessedMetrics int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *CpustatCollector) Init(config json.RawMessage) error {
|
func (m *CpustatCollector) Init(config json.RawMessage) error {
|
||||||
@@ -86,6 +88,7 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
|
|||||||
num_cpus++
|
num_cpus++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
m.statsProcessedMetrics = 0
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -106,6 +109,7 @@ func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]st
|
|||||||
for name, value := range values {
|
for name, value := range values {
|
||||||
y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": (value * 100.0) / total}, t)
|
y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": (value * 100.0) / total}, t)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
m.statsProcessedMetrics++
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -141,8 +145,10 @@ func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
time.Now(),
|
time.Now(),
|
||||||
)
|
)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
m.statsProcessedMetrics++
|
||||||
output <- num_cpus_metric
|
output <- num_cpus_metric
|
||||||
}
|
}
|
||||||
|
stats.ComponentStatInt(m.name, "processed_metrics", m.statsProcessedMetrics)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *CpustatCollector) Close() {
|
func (m *CpustatCollector) Close() {
|
||||||
|
@@ -10,6 +10,7 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
influx "github.com/influxdata/line-protocol"
|
influx "github.com/influxdata/line-protocol"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -23,11 +24,14 @@ type CustomCmdCollectorConfig struct {
|
|||||||
|
|
||||||
type CustomCmdCollector struct {
|
type CustomCmdCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
handler *influx.MetricHandler
|
handler *influx.MetricHandler
|
||||||
parser *influx.Parser
|
parser *influx.Parser
|
||||||
config CustomCmdCollectorConfig
|
config CustomCmdCollectorConfig
|
||||||
commands []string
|
commands []string
|
||||||
files []string
|
files []string
|
||||||
|
statsProcessedMetrics int64
|
||||||
|
statsProcessedCommands int64
|
||||||
|
statsProcessedFiles int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *CustomCmdCollector) Init(config json.RawMessage) error {
|
func (m *CustomCmdCollector) Init(config json.RawMessage) error {
|
||||||
@@ -66,6 +70,9 @@ func (m *CustomCmdCollector) Init(config json.RawMessage) error {
|
|||||||
m.handler = influx.NewMetricHandler()
|
m.handler = influx.NewMetricHandler()
|
||||||
m.parser = influx.NewParser(m.handler)
|
m.parser = influx.NewParser(m.handler)
|
||||||
m.parser.SetTimeFunc(DefaultTime)
|
m.parser.SetTimeFunc(DefaultTime)
|
||||||
|
m.statsProcessedMetrics = 0
|
||||||
|
m.statsProcessedFiles = 0
|
||||||
|
m.statsProcessedCommands = 0
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -100,9 +107,13 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMetri
|
|||||||
|
|
||||||
y := lp.FromInfluxMetric(c)
|
y := lp.FromInfluxMetric(c)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
m.statsProcessedMetrics++
|
||||||
|
stats.ComponentStatInt(m.name, "processed_metrics", m.statsProcessedMetrics)
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
m.statsProcessedCommands++
|
||||||
|
stats.ComponentStatInt(m.name, "processed_commands", m.statsProcessedCommands)
|
||||||
}
|
}
|
||||||
for _, file := range m.files {
|
for _, file := range m.files {
|
||||||
buffer, err := ioutil.ReadFile(file)
|
buffer, err := ioutil.ReadFile(file)
|
||||||
@@ -122,9 +133,13 @@ func (m *CustomCmdCollector) Read(interval time.Duration, output chan lp.CCMetri
|
|||||||
}
|
}
|
||||||
y := lp.FromInfluxMetric(f)
|
y := lp.FromInfluxMetric(f)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
m.statsProcessedMetrics++
|
||||||
|
stats.ComponentStatInt(m.name, "processed_metrics", m.statsProcessedMetrics)
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
m.statsProcessedFiles++
|
||||||
|
stats.ComponentStatInt(m.name, "processed_files", m.statsProcessedFiles)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -11,6 +11,7 @@ import (
|
|||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
)
|
)
|
||||||
|
|
||||||
// "log"
|
// "log"
|
||||||
@@ -23,9 +24,8 @@ type DiskstatCollectorConfig struct {
|
|||||||
|
|
||||||
type DiskstatCollector struct {
|
type DiskstatCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
//matches map[string]int
|
config DiskstatCollectorConfig
|
||||||
config IOstatCollectorConfig
|
statsProcessedMetrics int64
|
||||||
//devices map[string]IOstatCollectorEntry
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *DiskstatCollector) Init(config json.RawMessage) error {
|
func (m *DiskstatCollector) Init(config json.RawMessage) error {
|
||||||
@@ -44,6 +44,7 @@ func (m *DiskstatCollector) Init(config json.RawMessage) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
defer file.Close()
|
defer file.Close()
|
||||||
|
m.statsProcessedMetrics = 0
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -89,12 +90,16 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMetric
|
|||||||
y, err := lp.New("disk_total", tags, m.meta, map[string]interface{}{"value": total}, time.Now())
|
y, err := lp.New("disk_total", tags, m.meta, map[string]interface{}{"value": total}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "GBytes")
|
y.AddMeta("unit", "GBytes")
|
||||||
|
m.statsProcessedMetrics++
|
||||||
|
stats.ComponentStatInt(m.name, "processed_metrics", m.statsProcessedMetrics)
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
free := (stat.Bfree * uint64(stat.Bsize)) / uint64(1000000000)
|
free := (stat.Bfree * uint64(stat.Bsize)) / uint64(1000000000)
|
||||||
y, err = lp.New("disk_free", tags, m.meta, map[string]interface{}{"value": free}, time.Now())
|
y, err = lp.New("disk_free", tags, m.meta, map[string]interface{}{"value": free}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "GBytes")
|
y.AddMeta("unit", "GBytes")
|
||||||
|
m.statsProcessedMetrics++
|
||||||
|
stats.ComponentStatInt(m.name, "processed_metrics", m.statsProcessedMetrics)
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
perc := (100 * (total - free)) / total
|
perc := (100 * (total - free)) / total
|
||||||
@@ -105,6 +110,8 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMetric
|
|||||||
y, err := lp.New("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": int(part_max_used)}, time.Now())
|
y, err := lp.New("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": int(part_max_used)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "percent")
|
y.AddMeta("unit", "percent")
|
||||||
|
m.statsProcessedMetrics++
|
||||||
|
stats.ComponentStatInt(m.name, "processed_metrics", m.statsProcessedMetrics)
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -15,6 +15,7 @@ import (
|
|||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
)
|
)
|
||||||
|
|
||||||
const DEFAULT_GPFS_CMD = "mmpmon"
|
const DEFAULT_GPFS_CMD = "mmpmon"
|
||||||
@@ -32,9 +33,10 @@ type GpfsCollector struct {
|
|||||||
ExcludeFilesystem []string `json:"exclude_filesystem,omitempty"`
|
ExcludeFilesystem []string `json:"exclude_filesystem,omitempty"`
|
||||||
SendBandwidths bool `json:"send_bandwidths"`
|
SendBandwidths bool `json:"send_bandwidths"`
|
||||||
}
|
}
|
||||||
skipFS map[string]struct{}
|
skipFS map[string]struct{}
|
||||||
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
|
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
|
||||||
lastState map[string]GpfsCollectorLastState
|
lastState map[string]GpfsCollectorLastState
|
||||||
|
statsProcessedMetrics int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *GpfsCollector) Init(config json.RawMessage) error {
|
func (m *GpfsCollector) Init(config json.RawMessage) error {
|
||||||
@@ -86,7 +88,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
|
|||||||
return fmt.Errorf("failed to find mmpmon binary '%s': %v", m.config.Mmpmon, err)
|
return fmt.Errorf("failed to find mmpmon binary '%s': %v", m.config.Mmpmon, err)
|
||||||
}
|
}
|
||||||
m.config.Mmpmon = p
|
m.config.Mmpmon = p
|
||||||
|
m.statsProcessedMetrics = 0
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -211,12 +213,14 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
|||||||
}
|
}
|
||||||
if y, err := lp.New("gpfs_bytes_read", m.tags, m.meta, map[string]interface{}{"value": bytesRead}, timestamp); err == nil {
|
if y, err := lp.New("gpfs_bytes_read", m.tags, m.meta, map[string]interface{}{"value": bytesRead}, timestamp); err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
if m.config.SendBandwidths {
|
if m.config.SendBandwidths {
|
||||||
if lastBytesRead := m.lastState[filesystem].bytesRead; lastBytesRead >= 0 {
|
if lastBytesRead := m.lastState[filesystem].bytesRead; lastBytesRead >= 0 {
|
||||||
bwRead := float64(bytesRead-lastBytesRead) / timeDiff
|
bwRead := float64(bytesRead-lastBytesRead) / timeDiff
|
||||||
if y, err := lp.New("gpfs_bw_read", m.tags, m.meta, map[string]interface{}{"value": bwRead}, timestamp); err == nil {
|
if y, err := lp.New("gpfs_bw_read", m.tags, m.meta, map[string]interface{}{"value": bwRead}, timestamp); err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -231,12 +235,14 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
|||||||
}
|
}
|
||||||
if y, err := lp.New("gpfs_bytes_written", m.tags, m.meta, map[string]interface{}{"value": bytesWritten}, timestamp); err == nil {
|
if y, err := lp.New("gpfs_bytes_written", m.tags, m.meta, map[string]interface{}{"value": bytesWritten}, timestamp); err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
if m.config.SendBandwidths {
|
if m.config.SendBandwidths {
|
||||||
if lastBytesWritten := m.lastState[filesystem].bytesRead; lastBytesWritten >= 0 {
|
if lastBytesWritten := m.lastState[filesystem].bytesRead; lastBytesWritten >= 0 {
|
||||||
bwWrite := float64(bytesWritten-lastBytesWritten) / timeDiff
|
bwWrite := float64(bytesWritten-lastBytesWritten) / timeDiff
|
||||||
if y, err := lp.New("gpfs_bw_write", m.tags, m.meta, map[string]interface{}{"value": bwWrite}, timestamp); err == nil {
|
if y, err := lp.New("gpfs_bw_write", m.tags, m.meta, map[string]interface{}{"value": bwWrite}, timestamp); err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -258,6 +264,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
|||||||
}
|
}
|
||||||
if y, err := lp.New("gpfs_num_opens", m.tags, m.meta, map[string]interface{}{"value": numOpens}, timestamp); err == nil {
|
if y, err := lp.New("gpfs_num_opens", m.tags, m.meta, map[string]interface{}{"value": numOpens}, timestamp); err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
|
|
||||||
// number of closes
|
// number of closes
|
||||||
@@ -270,6 +277,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
|||||||
}
|
}
|
||||||
if y, err := lp.New("gpfs_num_closes", m.tags, m.meta, map[string]interface{}{"value": numCloses}, timestamp); err == nil {
|
if y, err := lp.New("gpfs_num_closes", m.tags, m.meta, map[string]interface{}{"value": numCloses}, timestamp); err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
|
|
||||||
// number of reads
|
// number of reads
|
||||||
@@ -282,6 +290,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
|||||||
}
|
}
|
||||||
if y, err := lp.New("gpfs_num_reads", m.tags, m.meta, map[string]interface{}{"value": numReads}, timestamp); err == nil {
|
if y, err := lp.New("gpfs_num_reads", m.tags, m.meta, map[string]interface{}{"value": numReads}, timestamp); err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
|
|
||||||
// number of writes
|
// number of writes
|
||||||
@@ -294,6 +303,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
|||||||
}
|
}
|
||||||
if y, err := lp.New("gpfs_num_writes", m.tags, m.meta, map[string]interface{}{"value": numWrites}, timestamp); err == nil {
|
if y, err := lp.New("gpfs_num_writes", m.tags, m.meta, map[string]interface{}{"value": numWrites}, timestamp); err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
|
|
||||||
// number of read directories
|
// number of read directories
|
||||||
@@ -306,6 +316,7 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
|||||||
}
|
}
|
||||||
if y, err := lp.New("gpfs_num_readdirs", m.tags, m.meta, map[string]interface{}{"value": numReaddirs}, timestamp); err == nil {
|
if y, err := lp.New("gpfs_num_readdirs", m.tags, m.meta, map[string]interface{}{"value": numReaddirs}, timestamp); err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
|
|
||||||
// Number of inode updates
|
// Number of inode updates
|
||||||
@@ -317,9 +328,11 @@ func (m *GpfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if y, err := lp.New("gpfs_num_inode_updates", m.tags, m.meta, map[string]interface{}{"value": numInodeUpdates}, timestamp); err == nil {
|
if y, err := lp.New("gpfs_num_inode_updates", m.tags, m.meta, map[string]interface{}{"value": numInodeUpdates}, timestamp); err == nil {
|
||||||
|
m.statsProcessedMetrics++
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
stats.ComponentStatInt(m.name, "processed_metrics", m.statsProcessedMetrics)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *GpfsCollector) Close() {
|
func (m *GpfsCollector) Close() {
|
||||||
|
@@ -7,6 +7,7 @@ import (
|
|||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
"golang.org/x/sys/unix"
|
"golang.org/x/sys/unix"
|
||||||
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
@@ -18,13 +19,18 @@ import (
|
|||||||
|
|
||||||
const IB_BASEPATH = "/sys/class/infiniband/"
|
const IB_BASEPATH = "/sys/class/infiniband/"
|
||||||
|
|
||||||
|
type InfinibandCollectorMetric struct {
|
||||||
|
path string
|
||||||
|
unit string
|
||||||
|
}
|
||||||
|
|
||||||
type InfinibandCollectorInfo struct {
|
type InfinibandCollectorInfo struct {
|
||||||
LID string // IB local Identifier (LID)
|
LID string // IB local Identifier (LID)
|
||||||
device string // IB device
|
device string // IB device
|
||||||
port string // IB device port
|
port string // IB device port
|
||||||
portCounterFiles map[string]string // mapping counter name -> sysfs file
|
portCounterFiles map[string]InfinibandCollectorMetric // mapping counter name -> InfinibandCollectorMetric
|
||||||
tagSet map[string]string // corresponding tag list
|
tagSet map[string]string // corresponding tag list
|
||||||
lastState map[string]int64 // State from last measurement
|
lastState map[string]int64 // State from last measurement
|
||||||
}
|
}
|
||||||
|
|
||||||
type InfinibandCollector struct {
|
type InfinibandCollector struct {
|
||||||
@@ -34,8 +40,9 @@ type InfinibandCollector struct {
|
|||||||
SendAbsoluteValues bool `json:"send_abs_values"` // Send absolut values as read from sys filesystem
|
SendAbsoluteValues bool `json:"send_abs_values"` // Send absolut values as read from sys filesystem
|
||||||
SendDerivedValues bool `json:"send_derived_values"` // Send derived values e.g. rates
|
SendDerivedValues bool `json:"send_derived_values"` // Send derived values e.g. rates
|
||||||
}
|
}
|
||||||
info []*InfinibandCollectorInfo
|
info []*InfinibandCollectorInfo
|
||||||
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
|
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
|
||||||
|
statsProcessedMetrics int64
|
||||||
}
|
}
|
||||||
|
|
||||||
// Init initializes the Infiniband collector by walking through files below IB_BASEPATH
|
// Init initializes the Infiniband collector by walking through files below IB_BASEPATH
|
||||||
@@ -106,16 +113,16 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
|||||||
|
|
||||||
// Check access to counter files
|
// Check access to counter files
|
||||||
countersDir := filepath.Join(path, "counters")
|
countersDir := filepath.Join(path, "counters")
|
||||||
portCounterFiles := map[string]string{
|
portCounterFiles := map[string]InfinibandCollectorMetric{
|
||||||
"ib_recv": filepath.Join(countersDir, "port_rcv_data"),
|
"ib_recv": {path: filepath.Join(countersDir, "port_rcv_data"), unit: "bytes"},
|
||||||
"ib_xmit": filepath.Join(countersDir, "port_xmit_data"),
|
"ib_xmit": {path: filepath.Join(countersDir, "port_xmit_data"), unit: "bytes"},
|
||||||
"ib_recv_pkts": filepath.Join(countersDir, "port_rcv_packets"),
|
"ib_recv_pkts": {path: filepath.Join(countersDir, "port_rcv_packets"), unit: "packets"},
|
||||||
"ib_xmit_pkts": filepath.Join(countersDir, "port_xmit_packets"),
|
"ib_xmit_pkts": {path: filepath.Join(countersDir, "port_xmit_packets"), unit: "packets"},
|
||||||
}
|
}
|
||||||
for _, counterFile := range portCounterFiles {
|
for _, counter := range portCounterFiles {
|
||||||
err := unix.Access(counterFile, unix.R_OK)
|
err := unix.Access(counter.path, unix.R_OK)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("unable to access %s: %v", counterFile, err)
|
return fmt.Errorf("unable to access %s: %v", counter.path, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -144,7 +151,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
|
|||||||
if len(m.info) == 0 {
|
if len(m.info) == 0 {
|
||||||
return fmt.Errorf("found no IB devices")
|
return fmt.Errorf("found no IB devices")
|
||||||
}
|
}
|
||||||
|
m.statsProcessedMetrics = 0
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -165,14 +172,14 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
|
|||||||
m.lastTimestamp = now
|
m.lastTimestamp = now
|
||||||
|
|
||||||
for _, info := range m.info {
|
for _, info := range m.info {
|
||||||
for counterName, counterFile := range info.portCounterFiles {
|
for counterName, counterDef := range info.portCounterFiles {
|
||||||
|
|
||||||
// Read counter file
|
// Read counter file
|
||||||
line, err := ioutil.ReadFile(counterFile)
|
line, err := ioutil.ReadFile(counterDef.path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(
|
cclog.ComponentError(
|
||||||
m.name,
|
m.name,
|
||||||
fmt.Sprintf("Read(): Failed to read from file '%s': %v", counterFile, err))
|
fmt.Sprintf("Read(): Failed to read from file '%s': %v", counterDef.path, err))
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
data := strings.TrimSpace(string(line))
|
data := strings.TrimSpace(string(line))
|
||||||
@@ -189,7 +196,9 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
|
|||||||
// Send absolute values
|
// Send absolute values
|
||||||
if m.config.SendAbsoluteValues {
|
if m.config.SendAbsoluteValues {
|
||||||
if y, err := lp.New(counterName, info.tagSet, m.meta, map[string]interface{}{"value": v}, now); err == nil {
|
if y, err := lp.New(counterName, info.tagSet, m.meta, map[string]interface{}{"value": v}, now); err == nil {
|
||||||
|
y.AddMeta("unit", counterDef.unit)
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -198,7 +207,9 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
|
|||||||
if info.lastState[counterName] >= 0 {
|
if info.lastState[counterName] >= 0 {
|
||||||
rate := float64((v - info.lastState[counterName])) / timeDiff
|
rate := float64((v - info.lastState[counterName])) / timeDiff
|
||||||
if y, err := lp.New(counterName+"_bw", info.tagSet, m.meta, map[string]interface{}{"value": rate}, now); err == nil {
|
if y, err := lp.New(counterName+"_bw", info.tagSet, m.meta, map[string]interface{}{"value": rate}, now); err == nil {
|
||||||
|
y.AddMeta("unit", counterDef.unit+"/sec")
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Save current state
|
// Save current state
|
||||||
@@ -207,6 +218,7 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMetr
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
stats.ComponentStatInt(m.name, "processed_metrics", m.statsProcessedMetrics)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *InfinibandCollector) Close() {
|
func (m *InfinibandCollector) Close() {
|
||||||
|
@@ -6,6 +6,7 @@ import (
|
|||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
|
|
||||||
// "log"
|
// "log"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
@@ -29,9 +30,10 @@ type IOstatCollectorEntry struct {
|
|||||||
|
|
||||||
type IOstatCollector struct {
|
type IOstatCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
matches map[string]int
|
matches map[string]int
|
||||||
config IOstatCollectorConfig
|
config IOstatCollectorConfig
|
||||||
devices map[string]IOstatCollectorEntry
|
devices map[string]IOstatCollectorEntry
|
||||||
|
statsProcessedMetrics int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *IOstatCollector) Init(config json.RawMessage) error {
|
func (m *IOstatCollector) Init(config json.RawMessage) error {
|
||||||
@@ -102,6 +104,7 @@ func (m *IOstatCollector) Init(config json.RawMessage) error {
|
|||||||
lastValues: values,
|
lastValues: values,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
m.statsProcessedMetrics = 0
|
||||||
m.init = true
|
m.init = true
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@@ -141,6 +144,7 @@ func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
y, err := lp.New(name, entry.tags, m.meta, map[string]interface{}{"value": int(diff)}, time.Now())
|
y, err := lp.New(name, entry.tags, m.meta, map[string]interface{}{"value": int(diff)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
entry.lastValues[name] = x
|
entry.lastValues[name] = x
|
||||||
@@ -148,6 +152,7 @@ func (m *IOstatCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
}
|
}
|
||||||
m.devices[device] = entry
|
m.devices[device] = entry
|
||||||
}
|
}
|
||||||
|
stats.ComponentStatInt(m.name, "processed_metrics", m.statsProcessedMetrics)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *IOstatCollector) Close() {
|
func (m *IOstatCollector) Close() {
|
||||||
|
@@ -11,6 +11,7 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
)
|
)
|
||||||
|
|
||||||
const IPMITOOL_PATH = `ipmitool`
|
const IPMITOOL_PATH = `ipmitool`
|
||||||
@@ -26,9 +27,10 @@ type IpmiCollector struct {
|
|||||||
metricCollector
|
metricCollector
|
||||||
//tags map[string]string
|
//tags map[string]string
|
||||||
//matches map[string]string
|
//matches map[string]string
|
||||||
config IpmiCollectorConfig
|
config IpmiCollectorConfig
|
||||||
ipmitool string
|
ipmitool string
|
||||||
ipmisensors string
|
ipmisensors string
|
||||||
|
statsProcessedMetrics int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *IpmiCollector) Init(config json.RawMessage) error {
|
func (m *IpmiCollector) Init(config json.RawMessage) error {
|
||||||
@@ -56,6 +58,7 @@ func (m *IpmiCollector) Init(config json.RawMessage) error {
|
|||||||
if len(m.ipmitool) == 0 && len(m.ipmisensors) == 0 {
|
if len(m.ipmitool) == 0 && len(m.ipmisensors) == 0 {
|
||||||
return errors.New("no IPMI reader found")
|
return errors.New("no IPMI reader found")
|
||||||
}
|
}
|
||||||
|
m.statsProcessedMetrics = 0
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -94,6 +97,7 @@ func (m *IpmiCollector) readIpmiTool(cmd string, output chan lp.CCMetric) {
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", unit)
|
y.AddMeta("unit", unit)
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -123,6 +127,7 @@ func (m *IpmiCollector) readIpmiSensors(cmd string, output chan lp.CCMetric) {
|
|||||||
y.AddMeta("unit", lv[4])
|
y.AddMeta("unit", lv[4])
|
||||||
}
|
}
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -141,6 +146,7 @@ func (m *IpmiCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
|||||||
m.readIpmiSensors(m.config.IpmisensorsPath, output)
|
m.readIpmiSensors(m.config.IpmisensorsPath, output)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
stats.ComponentStatInt(m.name, "processed_metrics", m.statsProcessedMetrics)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *IpmiCollector) Close() {
|
func (m *IpmiCollector) Close() {
|
||||||
|
@@ -15,8 +15,12 @@ import (
|
|||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"math"
|
"math"
|
||||||
"os"
|
"os"
|
||||||
|
"os/signal"
|
||||||
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
"unsafe"
|
"unsafe"
|
||||||
|
|
||||||
@@ -24,6 +28,7 @@ import (
|
|||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
topo "github.com/ClusterCockpit/cc-metric-collector/internal/ccTopology"
|
topo "github.com/ClusterCockpit/cc-metric-collector/internal/ccTopology"
|
||||||
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
|
agg "github.com/ClusterCockpit/cc-metric-collector/internal/metricAggregator"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
"github.com/NVIDIA/go-nvml/pkg/dl"
|
"github.com/NVIDIA/go-nvml/pkg/dl"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -46,6 +51,16 @@ type LikwidCollectorEventsetConfig struct {
|
|||||||
Metrics []LikwidCollectorMetricConfig `json:"metrics"`
|
Metrics []LikwidCollectorMetricConfig `json:"metrics"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LikwidEventsetConfig holds the runtime state of one LIKWID event set:
// the event string in Go and C form, the counter order and the per-CPU
// maps for raw counter results and derived metrics.
type LikwidEventsetConfig struct {
|
||||||
|
// internal is used as index into the collector's configured event sets
// (m.config.Eventsets[evset.internal])
internal int
|
||||||
|
// gid is the group ID handed to perfmon_setupCounters/perfmon_getLastResult;
// genLikwidEventSet initialises it to -1
gid C.int
|
||||||
|
// eorder lists the counter names as C strings; the slice index is used as
// event index when reading results
eorder []*C.char
|
||||||
|
// estr is the C string copy of the event string
estr *C.char
|
||||||
|
// go_estr is the same event string as a Go string
go_estr string
|
||||||
|
// results maps an ID to the parameters used for formula evaluation
// (counter values plus "time" and "inverseClock")
// NOTE(review): filled per CPU ID in genLikwidEventSet but indexed by
// thread ID in calcEventsetMetrics - confirm both always coincide
results map[int]map[string]interface{}
|
||||||
|
// metrics maps an ID to the derived metric values (metric name -> value)
metrics map[int]map[string]float64
|
||||||
|
}
|
||||||
|
|
||||||
type LikwidCollectorConfig struct {
|
type LikwidCollectorConfig struct {
|
||||||
Eventsets []LikwidCollectorEventsetConfig `json:"eventsets"`
|
Eventsets []LikwidCollectorEventsetConfig `json:"eventsets"`
|
||||||
Metrics []LikwidCollectorMetricConfig `json:"globalmetrics,omitempty"`
|
Metrics []LikwidCollectorMetricConfig `json:"globalmetrics,omitempty"`
|
||||||
@@ -58,17 +73,21 @@ type LikwidCollectorConfig struct {
|
|||||||
|
|
||||||
type LikwidCollector struct {
|
type LikwidCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
cpulist []C.int
|
cpulist []C.int
|
||||||
cpu2tid map[int]int
|
cpu2tid map[int]int
|
||||||
sock2tid map[int]int
|
sock2tid map[int]int
|
||||||
metrics map[C.int]map[string]int
|
metrics map[C.int]map[string]int
|
||||||
groups []C.int
|
groups []C.int
|
||||||
config LikwidCollectorConfig
|
config LikwidCollectorConfig
|
||||||
results map[int]map[int]map[string]interface{}
|
gmresults map[int]map[string]float64
|
||||||
mresults map[int]map[int]map[string]float64
|
basefreq float64
|
||||||
gmresults map[int]map[string]float64
|
running bool
|
||||||
basefreq float64
|
initialized bool
|
||||||
running bool
|
likwidGroups map[C.int]LikwidEventsetConfig
|
||||||
|
lock sync.Mutex
|
||||||
|
statsMeasurements int64
|
||||||
|
statsProcessedMetrics int64
|
||||||
|
statsPublishedMetrics int64
|
||||||
}
|
}
|
||||||
|
|
||||||
type LikwidMetric struct {
|
type LikwidMetric struct {
|
||||||
@@ -86,14 +105,60 @@ func eventsToEventStr(events map[string]string) string {
|
|||||||
return strings.Join(elist, ",")
|
return strings.Join(elist, ",")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func genLikwidEventSet(input LikwidCollectorEventsetConfig) LikwidEventsetConfig {
|
||||||
|
tmplist := make([]string, 0)
|
||||||
|
clist := make([]string, 0)
|
||||||
|
for k := range input.Events {
|
||||||
|
clist = append(clist, k)
|
||||||
|
}
|
||||||
|
sort.Strings(clist)
|
||||||
|
elist := make([]*C.char, 0)
|
||||||
|
for _, k := range clist {
|
||||||
|
v := input.Events[k]
|
||||||
|
tmplist = append(tmplist, fmt.Sprintf("%s:%s", v, k))
|
||||||
|
c_counter := C.CString(k)
|
||||||
|
elist = append(elist, c_counter)
|
||||||
|
}
|
||||||
|
estr := strings.Join(tmplist, ",")
|
||||||
|
res := make(map[int]map[string]interface{})
|
||||||
|
met := make(map[int]map[string]float64)
|
||||||
|
for _, i := range topo.CpuList() {
|
||||||
|
res[i] = make(map[string]interface{})
|
||||||
|
for k := range input.Events {
|
||||||
|
res[i][k] = 0.0
|
||||||
|
}
|
||||||
|
met[i] = make(map[string]float64)
|
||||||
|
for _, v := range input.Metrics {
|
||||||
|
res[i][v.Name] = 0.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return LikwidEventsetConfig{
|
||||||
|
gid: -1,
|
||||||
|
eorder: elist,
|
||||||
|
estr: C.CString(estr),
|
||||||
|
go_estr: estr,
|
||||||
|
results: res,
|
||||||
|
metrics: met,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func testLikwidMetricFormula(formula string, params []string) bool {
|
||||||
|
myparams := make(map[string]interface{})
|
||||||
|
for _, p := range params {
|
||||||
|
myparams[p] = float64(1.0)
|
||||||
|
}
|
||||||
|
_, err := agg.EvalFloat64Condition(formula, myparams)
|
||||||
|
return err == nil
|
||||||
|
}
|
||||||
|
|
||||||
func getBaseFreq() float64 {
|
func getBaseFreq() float64 {
|
||||||
|
files := []string{
|
||||||
|
"/sys/devices/system/cpu/cpu0/cpufreq/bios_limit",
|
||||||
|
"/sys/devices/system/cpu/cpu0/cpufreq/base_frequency",
|
||||||
|
}
|
||||||
var freq float64 = math.NaN()
|
var freq float64 = math.NaN()
|
||||||
C.power_init(0)
|
for _, f := range files {
|
||||||
info := C.get_powerInfo()
|
buffer, err := ioutil.ReadFile(f)
|
||||||
if float64(info.baseFrequency) != 0 {
|
|
||||||
freq = float64(info.baseFrequency) * 1e6
|
|
||||||
} else {
|
|
||||||
buffer, err := ioutil.ReadFile("/sys/devices/system/cpu/cpu0/cpufreq/bios_limit")
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
data := strings.Replace(string(buffer), "\n", "", -1)
|
data := strings.Replace(string(buffer), "\n", "", -1)
|
||||||
x, err := strconv.ParseInt(data, 0, 64)
|
x, err := strconv.ParseInt(data, 0, 64)
|
||||||
@@ -102,12 +167,22 @@ func getBaseFreq() float64 {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if math.IsNaN(freq) {
|
||||||
|
C.power_init(0)
|
||||||
|
info := C.get_powerInfo()
|
||||||
|
if float64(info.baseFrequency) != 0 {
|
||||||
|
freq = float64(info.baseFrequency) * 1e6
|
||||||
|
}
|
||||||
|
C.power_finalize()
|
||||||
|
}
|
||||||
return freq
|
return freq
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *LikwidCollector) Init(config json.RawMessage) error {
|
func (m *LikwidCollector) Init(config json.RawMessage) error {
|
||||||
var ret C.int
|
|
||||||
m.name = "LikwidCollector"
|
m.name = "LikwidCollector"
|
||||||
|
m.initialized = false
|
||||||
|
m.running = false
|
||||||
m.config.AccessMode = LIKWID_DEF_ACCESSMODE
|
m.config.AccessMode = LIKWID_DEF_ACCESSMODE
|
||||||
m.config.LibraryPath = LIKWID_LIB_NAME
|
m.config.LibraryPath = LIKWID_LIB_NAME
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
@@ -131,7 +206,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
m.setup()
|
m.setup()
|
||||||
|
|
||||||
m.meta = map[string]string{"source": m.name, "group": "PerfCounter"}
|
m.meta = map[string]string{"group": "PerfCounter"}
|
||||||
cclog.ComponentDebug(m.name, "Get cpulist and init maps and lists")
|
cclog.ComponentDebug(m.name, "Get cpulist and init maps and lists")
|
||||||
cpulist := topo.CpuList()
|
cpulist := topo.CpuList()
|
||||||
m.cpulist = make([]C.int, len(cpulist))
|
m.cpulist = make([]C.int, len(cpulist))
|
||||||
@@ -140,172 +215,138 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {
|
|||||||
m.cpulist[i] = C.int(c)
|
m.cpulist[i] = C.int(c)
|
||||||
m.cpu2tid[c] = i
|
m.cpu2tid[c] = i
|
||||||
}
|
}
|
||||||
m.sock2tid = make(map[int]int)
|
|
||||||
tmp := make([]C.int, 1)
|
m.likwidGroups = make(map[C.int]LikwidEventsetConfig)
|
||||||
for _, sid := range topo.SocketList() {
|
|
||||||
cstr := C.CString(fmt.Sprintf("S%d:0", sid))
|
// m.results = make(map[int]map[int]map[string]interface{})
|
||||||
ret = C.cpustr_to_cpulist(cstr, &tmp[0], 1)
|
// m.mresults = make(map[int]map[int]map[string]float64)
|
||||||
if ret > 0 {
|
|
||||||
m.sock2tid[sid] = m.cpu2tid[int(tmp[0])]
|
|
||||||
}
|
|
||||||
C.free(unsafe.Pointer(cstr))
|
|
||||||
}
|
|
||||||
m.results = make(map[int]map[int]map[string]interface{})
|
|
||||||
m.mresults = make(map[int]map[int]map[string]float64)
|
|
||||||
m.gmresults = make(map[int]map[string]float64)
|
m.gmresults = make(map[int]map[string]float64)
|
||||||
cclog.ComponentDebug(m.name, "initialize LIKWID topology")
|
for _, tid := range m.cpu2tid {
|
||||||
ret = C.topology_init()
|
m.gmresults[tid] = make(map[string]float64)
|
||||||
if ret != 0 {
|
|
||||||
err := errors.New("failed to initialize LIKWID topology")
|
|
||||||
cclog.ComponentError(m.name, err.Error())
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
switch m.config.AccessMode {
|
|
||||||
case "direct":
|
|
||||||
C.HPMmode(0)
|
|
||||||
case "accessdaemon":
|
|
||||||
if len(m.config.DaemonPath) > 0 {
|
|
||||||
p := os.Getenv("PATH")
|
|
||||||
os.Setenv("PATH", m.config.DaemonPath+":"+p)
|
|
||||||
}
|
|
||||||
C.HPMmode(1)
|
|
||||||
}
|
|
||||||
|
|
||||||
cclog.ComponentDebug(m.name, "initialize LIKWID perfmon module")
|
|
||||||
ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
|
|
||||||
if ret != 0 {
|
|
||||||
C.topology_finalize()
|
|
||||||
err := errors.New("failed to initialize LIKWID topology")
|
|
||||||
cclog.ComponentError(m.name, err.Error())
|
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// This is for the global metrics computation test
|
// This is for the global metrics computation test
|
||||||
globalParams := make(map[string]interface{})
|
totalMetrics := 0
|
||||||
globalParams["time"] = float64(1.0)
|
// Generate parameter list for the metric computing test
|
||||||
globalParams["inverseClock"] = float64(1.0)
|
params := make([]string, 0)
|
||||||
// While adding the events, we test the metrics whether they can be computed at all
|
params = append(params, "time", "inverseClock")
|
||||||
for i, evset := range m.config.Eventsets {
|
// Generate parameter list for the global metric computing test
|
||||||
var gid C.int
|
globalParams := make([]string, 0)
|
||||||
var cstr *C.char
|
globalParams = append(globalParams, "time", "inverseClock")
|
||||||
|
// We test the eventset metrics whether they can be computed at all
|
||||||
|
for _, evset := range m.config.Eventsets {
|
||||||
if len(evset.Events) > 0 {
|
if len(evset.Events) > 0 {
|
||||||
estr := eventsToEventStr(evset.Events)
|
params = params[:2]
|
||||||
// Generate parameter list for the metric computing test
|
|
||||||
params := make(map[string]interface{})
|
|
||||||
params["time"] = float64(1.0)
|
|
||||||
params["inverseClock"] = float64(1.0)
|
|
||||||
for counter := range evset.Events {
|
for counter := range evset.Events {
|
||||||
params[counter] = float64(1.0)
|
params = append(params, counter)
|
||||||
}
|
}
|
||||||
for _, metric := range evset.Metrics {
|
for _, metric := range evset.Metrics {
|
||||||
// Try to evaluate the metric
|
// Try to evaluate the metric
|
||||||
_, err := agg.EvalFloat64Condition(metric.Calc, params)
|
if testLikwidMetricFormula(metric.Calc, params) {
|
||||||
if err != nil {
|
// Add the computable metric to the parameter list for the global metrics
|
||||||
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error())
|
globalParams = append(globalParams, metric.Name)
|
||||||
continue
|
totalMetrics++
|
||||||
}
|
} else {
|
||||||
// If the metric is not in the parameter list for the global metrics, add it
|
metric.Calc = ""
|
||||||
if _, ok := globalParams[metric.Name]; !ok {
|
|
||||||
globalParams[metric.Name] = float64(1.0)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Now we add the list of events to likwid
|
|
||||||
cstr = C.CString(estr)
|
|
||||||
gid = C.perfmon_addEventSet(cstr)
|
|
||||||
} else {
|
} else {
|
||||||
cclog.ComponentError(m.name, "Invalid Likwid eventset config, no events given")
|
cclog.ComponentError(m.name, "Invalid Likwid eventset config, no events given")
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if gid >= 0 {
|
|
||||||
m.groups = append(m.groups, gid)
|
|
||||||
}
|
|
||||||
C.free(unsafe.Pointer(cstr))
|
|
||||||
m.results[i] = make(map[int]map[string]interface{})
|
|
||||||
m.mresults[i] = make(map[int]map[string]float64)
|
|
||||||
for tid := range m.cpulist {
|
|
||||||
m.results[i][tid] = make(map[string]interface{})
|
|
||||||
m.mresults[i][tid] = make(map[string]float64)
|
|
||||||
if i == 0 {
|
|
||||||
m.gmresults[tid] = make(map[string]float64)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
for _, metric := range m.config.Metrics {
|
for _, metric := range m.config.Metrics {
|
||||||
// Try to evaluate the global metric
|
// Try to evaluate the global metric
|
||||||
_, err := agg.EvalFloat64Condition(metric.Calc, globalParams)
|
if !testLikwidMetricFormula(metric.Calc, globalParams) {
|
||||||
if err != nil {
|
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed")
|
||||||
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error())
|
metric.Calc = ""
|
||||||
continue
|
} else {
|
||||||
|
totalMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// If no event set could be added, shut down LikwidCollector
|
// If no event set could be added, shut down LikwidCollector
|
||||||
if len(m.groups) == 0 {
|
if totalMetrics == 0 {
|
||||||
C.perfmon_finalize()
|
err := errors.New("no LIKWID eventset or metric usable")
|
||||||
C.topology_finalize()
|
|
||||||
err := errors.New("no LIKWID performance group initialized")
|
|
||||||
cclog.ComponentError(m.name, err.Error())
|
cclog.ComponentError(m.name, err.Error())
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
m.basefreq = getBaseFreq()
|
m.statsMeasurements = 0
|
||||||
cclog.ComponentDebug(m.name, "BaseFreq", m.basefreq)
|
m.statsProcessedMetrics = 0
|
||||||
|
m.statsPublishedMetrics = 0
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// take a measurement for 'interval' seconds of event set index 'group'
|
// take a measurement for 'interval' seconds of event set index 'group'
|
||||||
func (m *LikwidCollector) takeMeasurement(group int, interval time.Duration) error {
|
func (m *LikwidCollector) takeMeasurement(evset LikwidEventsetConfig, interval time.Duration) (bool, error) {
|
||||||
var ret C.int
|
var ret C.int
|
||||||
gid := m.groups[group]
|
|
||||||
ret = C.perfmon_setupCounters(gid)
|
m.lock.Lock()
|
||||||
if ret != 0 {
|
if m.initialized {
|
||||||
gctr := C.GoString(C.perfmon_getGroupName(gid))
|
ret = C.perfmon_setupCounters(evset.gid)
|
||||||
err := fmt.Errorf("failed to setup performance group %d (%s)", gid, gctr)
|
if ret != 0 {
|
||||||
return err
|
var err error = nil
|
||||||
|
var skip bool = false
|
||||||
|
if ret == -37 {
|
||||||
|
skip = true
|
||||||
|
} else {
|
||||||
|
err = fmt.Errorf("failed to setup performance group %d", evset.gid)
|
||||||
|
}
|
||||||
|
m.lock.Unlock()
|
||||||
|
return skip, err
|
||||||
|
}
|
||||||
|
ret = C.perfmon_startCounters()
|
||||||
|
if ret != 0 {
|
||||||
|
var err error = nil
|
||||||
|
var skip bool = false
|
||||||
|
if ret == -37 {
|
||||||
|
skip = true
|
||||||
|
} else {
|
||||||
|
err = fmt.Errorf("failed to setup performance group %d", evset.gid)
|
||||||
|
}
|
||||||
|
m.lock.Unlock()
|
||||||
|
return skip, err
|
||||||
|
}
|
||||||
|
m.running = true
|
||||||
|
time.Sleep(interval)
|
||||||
|
m.running = false
|
||||||
|
ret = C.perfmon_stopCounters()
|
||||||
|
if ret != 0 {
|
||||||
|
var err error = nil
|
||||||
|
var skip bool = false
|
||||||
|
if ret == -37 {
|
||||||
|
skip = true
|
||||||
|
} else {
|
||||||
|
err = fmt.Errorf("failed to setup performance group %d", evset.gid)
|
||||||
|
}
|
||||||
|
m.lock.Unlock()
|
||||||
|
return skip, err
|
||||||
|
}
|
||||||
}
|
}
|
||||||
ret = C.perfmon_startCounters()
|
m.lock.Unlock()
|
||||||
if ret != 0 {
|
m.statsMeasurements++
|
||||||
gctr := C.GoString(C.perfmon_getGroupName(gid))
|
stats.ComponentStatInt(m.name, "measurements", m.statsMeasurements)
|
||||||
err := fmt.Errorf("failed to start performance group %d (%s)", gid, gctr)
|
return false, nil
|
||||||
return err
|
|
||||||
}
|
|
||||||
m.running = true
|
|
||||||
time.Sleep(interval)
|
|
||||||
m.running = false
|
|
||||||
ret = C.perfmon_stopCounters()
|
|
||||||
if ret != 0 {
|
|
||||||
gctr := C.GoString(C.perfmon_getGroupName(gid))
|
|
||||||
err := fmt.Errorf("failed to stop performance group %d (%s)", gid, gctr)
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get all measurement results for an event set, derive the metric values out of the measurement results and send it
|
// Get all measurement results for an event set, derive the metric values out of the measurement results and send it
|
||||||
func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration, output chan lp.CCMetric) error {
|
func (m *LikwidCollector) calcEventsetMetrics(evset LikwidEventsetConfig, interval time.Duration, output chan lp.CCMetric) error {
|
||||||
var eidx C.int
|
|
||||||
evset := m.config.Eventsets[group]
|
|
||||||
gid := m.groups[group]
|
|
||||||
invClock := float64(1.0 / m.basefreq)
|
invClock := float64(1.0 / m.basefreq)
|
||||||
|
|
||||||
// Go over events and get the results
|
// Go over events and get the results
|
||||||
for eidx = 0; int(eidx) < len(evset.Events); eidx++ {
|
for eidx, counter := range evset.eorder {
|
||||||
ctr := C.perfmon_getCounterName(gid, eidx)
|
gctr := C.GoString(counter)
|
||||||
gctr := C.GoString(ctr)
|
|
||||||
|
|
||||||
for _, tid := range m.cpu2tid {
|
for _, tid := range m.cpu2tid {
|
||||||
if tid >= 0 {
|
res := C.perfmon_getLastResult(evset.gid, C.int(eidx), C.int(tid))
|
||||||
m.results[group][tid]["time"] = interval.Seconds()
|
evset.results[tid][gctr] = float64(res)
|
||||||
m.results[group][tid]["inverseClock"] = invClock
|
evset.results[tid]["time"] = interval.Seconds()
|
||||||
res := C.perfmon_getLastResult(gid, eidx, C.int(tid))
|
evset.results[tid]["inverseClock"] = invClock
|
||||||
m.results[group][tid][gctr] = float64(res)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Go over the event set metrics, derive the value out of the event:counter values and send it
|
// Go over the event set metrics, derive the value out of the event:counter values and send it
|
||||||
for _, metric := range evset.Metrics {
|
for _, metric := range m.config.Eventsets[evset.internal].Metrics {
|
||||||
// The metric scope is determined in the Init() function
|
// The metric scope is determined in the Init() function
|
||||||
// Get the map scope-id -> tids
|
// Get the map scope-id -> tids
|
||||||
scopemap := m.cpu2tid
|
scopemap := m.cpu2tid
|
||||||
@@ -313,19 +354,21 @@ func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration,
|
|||||||
scopemap = m.sock2tid
|
scopemap = m.sock2tid
|
||||||
}
|
}
|
||||||
for domain, tid := range scopemap {
|
for domain, tid := range scopemap {
|
||||||
if tid >= 0 {
|
if tid >= 0 && len(metric.Calc) > 0 {
|
||||||
value, err := agg.EvalFloat64Condition(metric.Calc, m.results[group][tid])
|
value, err := agg.EvalFloat64Condition(metric.Calc, evset.results[tid])
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error())
|
cclog.ComponentError(m.name, "Calculation for metric", metric.Name, "failed:", err.Error())
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
m.mresults[group][tid][metric.Name] = value
|
evset.metrics[tid][metric.Name] = value
|
||||||
if m.config.InvalidToZero && math.IsNaN(value) {
|
if m.config.InvalidToZero && math.IsNaN(value) {
|
||||||
value = 0.0
|
value = 0.0
|
||||||
}
|
}
|
||||||
if m.config.InvalidToZero && math.IsInf(value, 0) {
|
if m.config.InvalidToZero && math.IsInf(value, 0) {
|
||||||
value = 0.0
|
value = 0.0
|
||||||
}
|
}
|
||||||
|
m.statsProcessedMetrics++
|
||||||
|
stats.ComponentStatInt(m.name, "processed_metrics", m.statsProcessedMetrics)
|
||||||
// Now we have the result, send it with the proper tags
|
// Now we have the result, send it with the proper tags
|
||||||
if !math.IsNaN(value) {
|
if !math.IsNaN(value) {
|
||||||
if metric.Publish {
|
if metric.Publish {
|
||||||
@@ -338,6 +381,8 @@ func (m *LikwidCollector) calcEventsetMetrics(group int, interval time.Duration,
|
|||||||
if len(metric.Unit) > 0 {
|
if len(metric.Unit) > 0 {
|
||||||
y.AddMeta("unit", metric.Unit)
|
y.AddMeta("unit", metric.Unit)
|
||||||
}
|
}
|
||||||
|
m.statsPublishedMetrics++
|
||||||
|
stats.ComponentStatInt(m.name, "published_metrics", m.statsPublishedMetrics)
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -360,8 +405,8 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan
|
|||||||
if tid >= 0 {
|
if tid >= 0 {
|
||||||
// Here we generate parameter list
|
// Here we generate parameter list
|
||||||
params := make(map[string]interface{})
|
params := make(map[string]interface{})
|
||||||
for j := range m.groups {
|
for _, evset := range m.likwidGroups {
|
||||||
for mname, mres := range m.mresults[j][tid] {
|
for mname, mres := range evset.metrics[tid] {
|
||||||
params[mname] = mres
|
params[mname] = mres
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -378,6 +423,8 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan
|
|||||||
if m.config.InvalidToZero && math.IsInf(value, 0) {
|
if m.config.InvalidToZero && math.IsInf(value, 0) {
|
||||||
value = 0.0
|
value = 0.0
|
||||||
}
|
}
|
||||||
|
m.statsProcessedMetrics++
|
||||||
|
stats.ComponentStatInt(m.name, "processed_metrics", m.statsProcessedMetrics)
|
||||||
// Now we have the result, send it with the proper tags
|
// Now we have the result, send it with the proper tags
|
||||||
if !math.IsNaN(value) {
|
if !math.IsNaN(value) {
|
||||||
if metric.Publish {
|
if metric.Publish {
|
||||||
@@ -391,6 +438,8 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan
|
|||||||
if len(metric.Unit) > 0 {
|
if len(metric.Unit) > 0 {
|
||||||
y.AddMeta("unit", metric.Unit)
|
y.AddMeta("unit", metric.Unit)
|
||||||
}
|
}
|
||||||
|
m.statsPublishedMetrics++
|
||||||
|
stats.ComponentStatInt(m.name, "published_metrics", m.statsPublishedMetrics)
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -401,38 +450,163 @@ func (m *LikwidCollector) calcGlobalMetrics(interval time.Duration, output chan
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (m *LikwidCollector) LateInit() error {
|
||||||
|
var ret C.int
|
||||||
|
if m.initialized {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
switch m.config.AccessMode {
|
||||||
|
case "direct":
|
||||||
|
C.HPMmode(0)
|
||||||
|
case "accessdaemon":
|
||||||
|
if len(m.config.DaemonPath) > 0 {
|
||||||
|
p := os.Getenv("PATH")
|
||||||
|
os.Setenv("PATH", m.config.DaemonPath+":"+p)
|
||||||
|
}
|
||||||
|
C.HPMmode(1)
|
||||||
|
}
|
||||||
|
cclog.ComponentDebug(m.name, "initialize LIKWID topology")
|
||||||
|
ret = C.topology_init()
|
||||||
|
if ret != 0 {
|
||||||
|
err := errors.New("failed to initialize LIKWID topology")
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
m.sock2tid = make(map[int]int)
|
||||||
|
tmp := make([]C.int, 1)
|
||||||
|
for _, sid := range topo.SocketList() {
|
||||||
|
cstr := C.CString(fmt.Sprintf("S%d:0", sid))
|
||||||
|
ret = C.cpustr_to_cpulist(cstr, &tmp[0], 1)
|
||||||
|
if ret > 0 {
|
||||||
|
m.sock2tid[sid] = m.cpu2tid[int(tmp[0])]
|
||||||
|
}
|
||||||
|
C.free(unsafe.Pointer(cstr))
|
||||||
|
}
|
||||||
|
|
||||||
|
m.basefreq = getBaseFreq()
|
||||||
|
cclog.ComponentDebug(m.name, "BaseFreq", m.basefreq)
|
||||||
|
|
||||||
|
cclog.ComponentDebug(m.name, "initialize LIKWID perfmon module")
|
||||||
|
ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
|
||||||
|
if ret != 0 {
|
||||||
|
var err error = nil
|
||||||
|
C.topology_finalize()
|
||||||
|
if ret != -22 {
|
||||||
|
err = errors.New("failed to initialize LIKWID perfmon")
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
} else {
|
||||||
|
err = errors.New("access to LIKWID perfmon locked")
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// While adding the events, we test the metrics whether they can be computed at all
|
||||||
|
for i, evset := range m.config.Eventsets {
|
||||||
|
var gid C.int
|
||||||
|
if len(evset.Events) > 0 {
|
||||||
|
skip := false
|
||||||
|
likwidGroup := genLikwidEventSet(evset)
|
||||||
|
for _, g := range m.likwidGroups {
|
||||||
|
if likwidGroup.go_estr == g.go_estr {
|
||||||
|
skip = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if skip {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Now we add the list of events to likwid
|
||||||
|
gid = C.perfmon_addEventSet(likwidGroup.estr)
|
||||||
|
if gid >= 0 {
|
||||||
|
likwidGroup.gid = gid
|
||||||
|
likwidGroup.internal = i
|
||||||
|
m.likwidGroups[gid] = likwidGroup
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
cclog.ComponentError(m.name, "Invalid Likwid eventset config, no events given")
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// If no event set could be added, shut down LikwidCollector
|
||||||
|
if len(m.likwidGroups) == 0 {
|
||||||
|
C.perfmon_finalize()
|
||||||
|
C.topology_finalize()
|
||||||
|
err := errors.New("no LIKWID performance group initialized")
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
sigchan := make(chan os.Signal, 1)
|
||||||
|
signal.Notify(sigchan, syscall.SIGCHLD)
|
||||||
|
signal.Notify(sigchan, os.Interrupt)
|
||||||
|
go func() {
|
||||||
|
<-sigchan
|
||||||
|
|
||||||
|
signal.Stop(sigchan)
|
||||||
|
m.initialized = false
|
||||||
|
}()
|
||||||
|
m.initialized = true
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// main read function taking multiple measurement rounds, each 'interval' seconds long
|
// main read function taking multiple measurement rounds, each 'interval' seconds long
|
||||||
func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
func (m *LikwidCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
||||||
|
var skip bool = false
|
||||||
|
var err error
|
||||||
if !m.init {
|
if !m.init {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
for i := range m.groups {
|
if !m.initialized {
|
||||||
// measure event set 'i' for 'interval' seconds
|
m.lock.Lock()
|
||||||
err := m.takeMeasurement(i, interval)
|
err = m.LateInit()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.ComponentError(m.name, err.Error())
|
m.lock.Unlock()
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
// read measurements and derive event set metrics
|
m.initialized = true
|
||||||
m.calcEventsetMetrics(i, interval, output)
|
m.lock.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
|
if m.initialized && !skip {
|
||||||
|
for _, evset := range m.likwidGroups {
|
||||||
|
if !skip {
|
||||||
|
// measure event set 'i' for 'interval' seconds
|
||||||
|
skip, err = m.takeMeasurement(evset, interval)
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentError(m.name, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !skip {
|
||||||
|
// read measurements and derive event set metrics
|
||||||
|
m.calcEventsetMetrics(evset, interval, output)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !skip {
|
||||||
|
// use the event set metrics to derive the global metrics
|
||||||
|
m.calcGlobalMetrics(interval, output)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// use the event set metrics to derive the global metrics
|
|
||||||
m.calcGlobalMetrics(interval, output)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *LikwidCollector) Close() {
|
func (m *LikwidCollector) Close() {
|
||||||
if m.init {
|
if m.init {
|
||||||
cclog.ComponentDebug(m.name, "Closing ...")
|
|
||||||
m.init = false
|
m.init = false
|
||||||
if m.running {
|
cclog.ComponentDebug(m.name, "Closing ...")
|
||||||
cclog.ComponentDebug(m.name, "Stopping counters")
|
m.lock.Lock()
|
||||||
C.perfmon_stopCounters()
|
if m.initialized {
|
||||||
|
cclog.ComponentDebug(m.name, "Finalize LIKWID perfmon module")
|
||||||
|
C.perfmon_finalize()
|
||||||
|
m.initialized = false
|
||||||
}
|
}
|
||||||
cclog.ComponentDebug(m.name, "Finalize LIKWID perfmon module")
|
m.lock.Unlock()
|
||||||
C.perfmon_finalize()
|
|
||||||
cclog.ComponentDebug(m.name, "Finalize LIKWID topology module")
|
cclog.ComponentDebug(m.name, "Finalize LIKWID topology module")
|
||||||
C.topology_finalize()
|
C.topology_finalize()
|
||||||
|
|
||||||
cclog.ComponentDebug(m.name, "Closing done")
|
cclog.ComponentDebug(m.name, "Closing done")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -10,6 +10,7 @@ import (
|
|||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
)
|
)
|
||||||
|
|
||||||
//
|
//
|
||||||
@@ -32,6 +33,7 @@ type LoadavgCollector struct {
|
|||||||
config struct {
|
config struct {
|
||||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||||
}
|
}
|
||||||
|
statsProcessedMetrics int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *LoadavgCollector) Init(config json.RawMessage) error {
|
func (m *LoadavgCollector) Init(config json.RawMessage) error {
|
||||||
@@ -63,6 +65,7 @@ func (m *LoadavgCollector) Init(config json.RawMessage) error {
|
|||||||
for i, name := range m.proc_matches {
|
for i, name := range m.proc_matches {
|
||||||
_, m.proc_skips[i] = stringArrayContains(m.config.ExcludeMetrics, name)
|
_, m.proc_skips[i] = stringArrayContains(m.config.ExcludeMetrics, name)
|
||||||
}
|
}
|
||||||
|
m.statsProcessedMetrics = 0
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -98,6 +101,7 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
|
y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -117,9 +121,10 @@ func (m *LoadavgCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
|
y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": x}, now)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
stats.ComponentStatInt(m.name, "processed_metrics", m.statsProcessedMetrics)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *LoadavgCollector) Close() {
|
func (m *LoadavgCollector) Close() {
|
||||||
|
@@ -12,6 +12,7 @@ import (
|
|||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
)
|
)
|
||||||
|
|
||||||
const LUSTRE_SYSFS = `/sys/fs/lustre`
|
const LUSTRE_SYSFS = `/sys/fs/lustre`
|
||||||
@@ -37,13 +38,14 @@ type LustreMetricDefinition struct {
|
|||||||
|
|
||||||
type LustreCollector struct {
|
type LustreCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
tags map[string]string
|
tags map[string]string
|
||||||
config LustreCollectorConfig
|
config LustreCollectorConfig
|
||||||
lctl string
|
lctl string
|
||||||
sudoCmd string
|
sudoCmd string
|
||||||
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
|
lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths
|
||||||
definitions []LustreMetricDefinition // Combined list without excluded metrics
|
definitions []LustreMetricDefinition // Combined list without excluded metrics
|
||||||
stats map[string]map[string]int64 // Data for last value per device and metric
|
stats map[string]map[string]int64 // Data for last value per device and metric
|
||||||
|
statsProcessedMetrics int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *LustreCollector) getDeviceDataCommand(device string) []string {
|
func (m *LustreCollector) getDeviceDataCommand(device string) []string {
|
||||||
@@ -372,6 +374,7 @@ func (m *LustreCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.lastTimestamp = time.Now()
|
m.lastTimestamp = time.Now()
|
||||||
|
m.statsProcessedMetrics = 0
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -418,11 +421,13 @@ func (m *LustreCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
y.AddMeta("unit", def.unit)
|
y.AddMeta("unit", def.unit)
|
||||||
}
|
}
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
devData[def.name] = use_x
|
devData[def.name] = use_x
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.lastTimestamp = now
|
m.lastTimestamp = now
|
||||||
|
stats.ComponentStatInt(m.name, "processed_metrics", m.statsProcessedMetrics)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *LustreCollector) Close() {
|
func (m *LustreCollector) Close() {
|
||||||
|
@@ -14,6 +14,7 @@ import (
|
|||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
)
|
)
|
||||||
|
|
||||||
const MEMSTATFILE = "/proc/meminfo"
|
const MEMSTATFILE = "/proc/meminfo"
|
||||||
@@ -32,16 +33,22 @@ type MemstatCollectorNode struct {
|
|||||||
|
|
||||||
type MemstatCollector struct {
|
type MemstatCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
stats map[string]int64
|
stats map[string]int64
|
||||||
tags map[string]string
|
tags map[string]string
|
||||||
matches map[string]string
|
matches map[string]string
|
||||||
config MemstatCollectorConfig
|
config MemstatCollectorConfig
|
||||||
nodefiles map[int]MemstatCollectorNode
|
nodefiles map[int]MemstatCollectorNode
|
||||||
sendMemUsed bool
|
sendMemUsed bool
|
||||||
|
statsProcessedMetrics int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func getStats(filename string) map[string]float64 {
|
type MemstatStats struct {
|
||||||
stats := make(map[string]float64)
|
value float64
|
||||||
|
unit string
|
||||||
|
}
|
||||||
|
|
||||||
|
func getStats(filename string) map[string]MemstatStats {
|
||||||
|
stats := make(map[string]MemstatStats)
|
||||||
file, err := os.Open(filename)
|
file, err := os.Open(filename)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cclog.Error(err.Error())
|
cclog.Error(err.Error())
|
||||||
@@ -55,12 +62,18 @@ func getStats(filename string) map[string]float64 {
|
|||||||
if len(linefields) == 3 {
|
if len(linefields) == 3 {
|
||||||
v, err := strconv.ParseFloat(linefields[1], 64)
|
v, err := strconv.ParseFloat(linefields[1], 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
stats[strings.Trim(linefields[0], ":")] = v
|
stats[strings.Trim(linefields[0], ":")] = MemstatStats{
|
||||||
|
value: v,
|
||||||
|
unit: linefields[2],
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else if len(linefields) == 5 {
|
} else if len(linefields) == 5 {
|
||||||
v, err := strconv.ParseFloat(linefields[3], 64)
|
v, err := strconv.ParseFloat(linefields[3], 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
stats[strings.Trim(linefields[0], ":")] = v
|
stats[strings.Trim(linefields[0], ":")] = MemstatStats{
|
||||||
|
value: v,
|
||||||
|
unit: linefields[4],
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -78,7 +91,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.meta = map[string]string{"source": m.name, "group": "Memory", "unit": "GByte"}
|
m.meta = map[string]string{"source": m.name, "group": "Memory"}
|
||||||
m.stats = make(map[string]int64)
|
m.stats = make(map[string]int64)
|
||||||
m.matches = make(map[string]string)
|
m.matches = make(map[string]string)
|
||||||
m.tags = map[string]string{"type": "node"}
|
m.tags = map[string]string{"type": "node"}
|
||||||
@@ -142,6 +155,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
m.statsProcessedMetrics = 0
|
||||||
m.init = true
|
m.init = true
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@@ -151,30 +165,53 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
sendStats := func(stats map[string]float64, tags map[string]string) {
|
sendStats := func(stats map[string]MemstatStats, tags map[string]string) {
|
||||||
for match, name := range m.matches {
|
for match, name := range m.matches {
|
||||||
var value float64 = 0
|
var value float64 = 0
|
||||||
|
var unit string = ""
|
||||||
if v, ok := stats[match]; ok {
|
if v, ok := stats[match]; ok {
|
||||||
value = v
|
value = v.value
|
||||||
|
if len(v.unit) > 0 {
|
||||||
|
unit = v.unit
|
||||||
|
}
|
||||||
}
|
}
|
||||||
y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": value * 1e-6}, time.Now())
|
|
||||||
|
y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": value}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
if len(unit) > 0 {
|
||||||
|
y.AddMeta("unit", unit)
|
||||||
|
}
|
||||||
|
m.statsProcessedMetrics++
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if m.sendMemUsed {
|
if m.sendMemUsed {
|
||||||
memUsed := 0.0
|
memUsed := 0.0
|
||||||
|
unit := ""
|
||||||
if totalVal, total := stats["MemTotal"]; total {
|
if totalVal, total := stats["MemTotal"]; total {
|
||||||
if freeVal, free := stats["MemFree"]; free {
|
if freeVal, free := stats["MemFree"]; free {
|
||||||
if bufVal, buffers := stats["Buffers"]; buffers {
|
if bufVal, buffers := stats["Buffers"]; buffers {
|
||||||
if cacheVal, cached := stats["Cached"]; cached {
|
if cacheVal, cached := stats["Cached"]; cached {
|
||||||
memUsed = totalVal - (freeVal + bufVal + cacheVal)
|
memUsed = totalVal.value - (freeVal.value + bufVal.value + cacheVal.value)
|
||||||
|
if len(totalVal.unit) > 0 {
|
||||||
|
unit = totalVal.unit
|
||||||
|
} else if len(freeVal.unit) > 0 {
|
||||||
|
unit = freeVal.unit
|
||||||
|
} else if len(bufVal.unit) > 0 {
|
||||||
|
unit = bufVal.unit
|
||||||
|
} else if len(cacheVal.unit) > 0 {
|
||||||
|
unit = cacheVal.unit
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
y, err := lp.New("mem_used", tags, m.meta, map[string]interface{}{"value": memUsed * 1e-6}, time.Now())
|
y, err := lp.New("mem_used", tags, m.meta, map[string]interface{}{"value": memUsed}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
if len(unit) > 0 {
|
||||||
|
y.AddMeta("unit", unit)
|
||||||
|
}
|
||||||
|
m.statsProcessedMetrics++
|
||||||
output <- y
|
output <- y
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -191,6 +228,7 @@ func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
sendStats(stats, nodeConf.tags)
|
sendStats(stats, nodeConf.tags)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
stats.ComponentStatInt(m.name, "collected_metrics", m.statsProcessedMetrics)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *MemstatCollector) Close() {
|
func (m *MemstatCollector) Close() {
|
||||||
|
@@ -11,6 +11,7 @@ import (
|
|||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
)
|
)
|
||||||
|
|
||||||
const NETSTATFILE = "/proc/net/dev"
|
const NETSTATFILE = "/proc/net/dev"
|
||||||
@@ -32,9 +33,10 @@ type NetstatCollectorMetric struct {
|
|||||||
|
|
||||||
type NetstatCollector struct {
|
type NetstatCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
config NetstatCollectorConfig
|
config NetstatCollectorConfig
|
||||||
matches map[string][]NetstatCollectorMetric
|
matches map[string][]NetstatCollectorMetric
|
||||||
lastTimestamp time.Time
|
lastTimestamp time.Time
|
||||||
|
statsProcessedMetrics int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *NetstatCollector) Init(config json.RawMessage) error {
|
func (m *NetstatCollector) Init(config json.RawMessage) error {
|
||||||
@@ -148,6 +150,7 @@ func (m *NetstatCollector) Init(config json.RawMessage) error {
|
|||||||
if len(m.matches) == 0 {
|
if len(m.matches) == 0 {
|
||||||
return errors.New("no devices to collector metrics found")
|
return errors.New("no devices to collector metrics found")
|
||||||
}
|
}
|
||||||
|
m.statsProcessedMetrics = 0
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -198,6 +201,7 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
if m.config.SendAbsoluteValues {
|
if m.config.SendAbsoluteValues {
|
||||||
if y, err := lp.New(metric.name, metric.tags, metric.meta, map[string]interface{}{"value": v}, now); err == nil {
|
if y, err := lp.New(metric.name, metric.tags, metric.meta, map[string]interface{}{"value": v}, now); err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if m.config.SendDerivedValues {
|
if m.config.SendDerivedValues {
|
||||||
@@ -205,6 +209,7 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
rate := float64(v-metric.lastValue) / timeDiff
|
rate := float64(v-metric.lastValue) / timeDiff
|
||||||
if y, err := lp.New(metric.name+"_bw", metric.tags, metric.meta_rates, map[string]interface{}{"value": rate}, now); err == nil {
|
if y, err := lp.New(metric.name+"_bw", metric.tags, metric.meta_rates, map[string]interface{}{"value": rate}, now); err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
metric.lastValue = v
|
metric.lastValue = v
|
||||||
@@ -212,6 +217,7 @@ func (m *NetstatCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
stats.ComponentStatInt(m.name, "processed_metrics", m.statsProcessedMetrics)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *NetstatCollector) Close() {
|
func (m *NetstatCollector) Close() {
|
||||||
|
@@ -12,6 +12,7 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
)
|
)
|
||||||
|
|
||||||
// First part contains the code for the general NfsCollector.
|
// First part contains the code for the general NfsCollector.
|
||||||
@@ -32,11 +33,12 @@ type nfsCollector struct {
|
|||||||
Nfsstats string `json:"nfsstat"`
|
Nfsstats string `json:"nfsstat"`
|
||||||
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
|
||||||
}
|
}
|
||||||
data map[string]NfsCollectorData
|
data map[string]NfsCollectorData
|
||||||
|
statsProcessedMetrics int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *nfsCollector) initStats() error {
|
func (m *nfsCollector) initStats() error {
|
||||||
cmd := exec.Command(m.config.Nfsstats, `-l`)
|
cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`)
|
||||||
cmd.Wait()
|
cmd.Wait()
|
||||||
buffer, err := cmd.Output()
|
buffer, err := cmd.Output()
|
||||||
if err == nil {
|
if err == nil {
|
||||||
@@ -52,7 +54,7 @@ func (m *nfsCollector) initStats() error {
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
x := m.data[name]
|
x := m.data[name]
|
||||||
x.current = value
|
x.current = value
|
||||||
x.last = 0
|
x.last = value
|
||||||
m.data[name] = x
|
m.data[name] = x
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -63,7 +65,7 @@ func (m *nfsCollector) initStats() error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *nfsCollector) updateStats() error {
|
func (m *nfsCollector) updateStats() error {
|
||||||
cmd := exec.Command(m.config.Nfsstats, `-l`)
|
cmd := exec.Command(m.config.Nfsstats, `-l`, `--all`)
|
||||||
cmd.Wait()
|
cmd.Wait()
|
||||||
buffer, err := cmd.Output()
|
buffer, err := cmd.Output()
|
||||||
if err == nil {
|
if err == nil {
|
||||||
@@ -113,6 +115,7 @@ func (m *nfsCollector) MainInit(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
m.data = make(map[string]NfsCollectorData)
|
m.data = make(map[string]NfsCollectorData)
|
||||||
m.initStats()
|
m.initStats()
|
||||||
|
m.statsProcessedMetrics = 0
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -143,8 +146,10 @@ func (m *nfsCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("version", m.version)
|
y.AddMeta("version", m.version)
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
stats.ComponentStatInt(m.name, "processed_metrics", m.statsProcessedMetrics)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *nfsCollector) Close() {
|
func (m *nfsCollector) Close() {
|
||||||
|
@@ -12,6 +12,7 @@ import (
|
|||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
)
|
)
|
||||||
|
|
||||||
//
|
//
|
||||||
@@ -44,7 +45,8 @@ type NUMAStatsCollectorTopolgy struct {
|
|||||||
|
|
||||||
type NUMAStatsCollector struct {
|
type NUMAStatsCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
topology []NUMAStatsCollectorTopolgy
|
topology []NUMAStatsCollectorTopolgy
|
||||||
|
statsProcessedMetrics int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
|
func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
|
||||||
@@ -80,7 +82,7 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
|
|||||||
tagSet: map[string]string{"memoryDomain": node},
|
tagSet: map[string]string{"memoryDomain": node},
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
m.statsProcessedMetrics = 0
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -127,11 +129,13 @@ func (m *NUMAStatsCollector) Read(interval time.Duration, output chan lp.CCMetri
|
|||||||
)
|
)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
file.Close()
|
file.Close()
|
||||||
}
|
}
|
||||||
|
stats.ComponentStatInt(m.name, "collected_metrics", m.statsProcessedMetrics)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *NUMAStatsCollector) Close() {
|
func (m *NUMAStatsCollector) Close() {
|
||||||
|
@@ -9,6 +9,7 @@ import (
|
|||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -26,9 +27,10 @@ type NvidiaCollectorDevice struct {
|
|||||||
|
|
||||||
type NvidiaCollector struct {
|
type NvidiaCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
num_gpus int
|
num_gpus int
|
||||||
config NvidiaCollectorConfig
|
config NvidiaCollectorConfig
|
||||||
gpus []NvidiaCollectorDevice
|
gpus []NvidiaCollectorDevice
|
||||||
|
statsProcessedMetrics int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *NvidiaCollector) CatchPanic() {
|
func (m *NvidiaCollector) CatchPanic() {
|
||||||
@@ -120,7 +122,7 @@ func (m *NvidiaCollector) Init(config json.RawMessage) error {
|
|||||||
pciInfo.Device)
|
pciInfo.Device)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
m.statsProcessedMetrics = 0
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -151,6 +153,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "%")
|
y.AddMeta("unit", "%")
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !device.excludeMetrics["nv_mem_util"] {
|
if !device.excludeMetrics["nv_mem_util"] {
|
||||||
@@ -158,6 +161,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "%")
|
y.AddMeta("unit", "%")
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -186,6 +190,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MByte")
|
y.AddMeta("unit", "MByte")
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -195,6 +200,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MByte")
|
y.AddMeta("unit", "MByte")
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -212,6 +218,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "degC")
|
y.AddMeta("unit", "degC")
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -232,6 +239,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "%")
|
y.AddMeta("unit", "%")
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -258,11 +266,13 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
}
|
}
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
} else if ret == nvml.ERROR_NOT_SUPPORTED {
|
} else if ret == nvml.ERROR_NOT_SUPPORTED {
|
||||||
y, err := lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "N/A"}, time.Now())
|
y, err := lp.New("nv_ecc_mode", device.tags, m.meta, map[string]interface{}{"value": "N/A"}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -280,6 +290,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
y, err := lp.New("nv_perf_state", device.tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pState))}, time.Now())
|
y, err := lp.New("nv_perf_state", device.tags, m.meta, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pState))}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -296,6 +307,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "watts")
|
y.AddMeta("unit", "watts")
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -313,6 +325,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MHz")
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -324,6 +337,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MHz")
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -335,6 +349,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MHz")
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -357,6 +372,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MHz")
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -368,6 +384,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MHz")
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -379,6 +396,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "MHz")
|
y.AddMeta("unit", "MHz")
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -398,6 +416,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
y, err := lp.New("nv_ecc_db_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
|
y, err := lp.New("nv_ecc_db_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -408,6 +427,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
y, err := lp.New("nv_ecc_sb_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
|
y, err := lp.New("nv_ecc_sb_error", device.tags, m.meta, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -425,6 +445,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "watts")
|
y.AddMeta("unit", "watts")
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -441,6 +462,7 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "%")
|
y.AddMeta("unit", "%")
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -457,11 +479,12 @@ func (m *NvidiaCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
y.AddMeta("unit", "%")
|
y.AddMeta("unit", "%")
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
stats.ComponentStatInt(m.name, "collected_metrics", m.statsProcessedMetrics)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *NvidiaCollector) Close() {
|
func (m *NvidiaCollector) Close() {
|
||||||
|
@@ -6,6 +6,7 @@ import (
|
|||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
)
|
)
|
||||||
|
|
||||||
// These are the fields we read from the JSON configuration
|
// These are the fields we read from the JSON configuration
|
||||||
@@ -17,9 +18,10 @@ type SampleCollectorConfig struct {
|
|||||||
// defined by metricCollector (name, init, ...)
|
// defined by metricCollector (name, init, ...)
|
||||||
type SampleCollector struct {
|
type SampleCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
config SampleTimerCollectorConfig // the configuration structure
|
config SampleTimerCollectorConfig // the configuration structure
|
||||||
meta map[string]string // default meta information
|
meta map[string]string // default meta information
|
||||||
tags map[string]string // default tags
|
tags map[string]string // default tags
|
||||||
|
statsCount int64
|
||||||
}
|
}
|
||||||
|
|
||||||
// Functions to implement MetricCollector interface
|
// Functions to implement MetricCollector interface
|
||||||
@@ -58,6 +60,9 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
|
|||||||
// for all topological entities (sockets, NUMA domains, ...)
|
// for all topological entities (sockets, NUMA domains, ...)
|
||||||
// Return some useful error message in case of any failures
|
// Return some useful error message in case of any failures
|
||||||
|
|
||||||
|
// Initialize counts for statistics
|
||||||
|
m.statsCount = 0
|
||||||
|
|
||||||
// Set this flag only if everything is initialized properly, all required files exist, ...
|
// Set this flag only if everything is initialized properly, all required files exist, ...
|
||||||
m.init = true
|
m.init = true
|
||||||
return err
|
return err
|
||||||
@@ -80,8 +85,11 @@ func (m *SampleCollector) Read(interval time.Duration, output chan lp.CCMetric)
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
// Send it to output channel
|
// Send it to output channel
|
||||||
output <- y
|
output <- y
|
||||||
|
// increment count for each sent metric or any other operation
|
||||||
|
m.statsCount++
|
||||||
}
|
}
|
||||||
|
// Set stats for the component
|
||||||
|
stats.ComponentStatInt(m.name, "count", m.statsCount)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Close metric collector: close network connection, close files, close libraries, ...
|
// Close metric collector: close network connection, close files, close libraries, ...
|
||||||
|
@@ -11,6 +11,7 @@ import (
|
|||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
)
|
)
|
||||||
|
|
||||||
// See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html
|
// See: https://www.kernel.org/doc/html/latest/hwmon/sysfs-interface.html
|
||||||
@@ -40,7 +41,8 @@ type TempCollector struct {
|
|||||||
ReportMaxTemp bool `json:"report_max_temperature"`
|
ReportMaxTemp bool `json:"report_max_temperature"`
|
||||||
ReportCriticalTemp bool `json:"report_critical_temperature"`
|
ReportCriticalTemp bool `json:"report_critical_temperature"`
|
||||||
}
|
}
|
||||||
sensors []*TempCollectorSensor
|
sensors []*TempCollectorSensor
|
||||||
|
statsProcessedMetrics int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *TempCollector) Init(config json.RawMessage) error {
|
func (m *TempCollector) Init(config json.RawMessage) error {
|
||||||
@@ -162,6 +164,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Finished initialization
|
// Finished initialization
|
||||||
|
m.statsProcessedMetrics = 0
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -194,6 +197,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
|||||||
)
|
)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
|
|
||||||
// max temperature
|
// max temperature
|
||||||
@@ -207,6 +211,7 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
|||||||
)
|
)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -221,10 +226,11 @@ func (m *TempCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
|||||||
)
|
)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
stats.ComponentStatInt(m.name, "processed_metrics", m.statsProcessedMetrics)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *TempCollector) Close() {
|
func (m *TempCollector) Close() {
|
||||||
|
@@ -10,6 +10,7 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
)
|
)
|
||||||
|
|
||||||
const MAX_NUM_PROCS = 10
|
const MAX_NUM_PROCS = 10
|
||||||
@@ -21,8 +22,9 @@ type TopProcsCollectorConfig struct {
|
|||||||
|
|
||||||
type TopProcsCollector struct {
|
type TopProcsCollector struct {
|
||||||
metricCollector
|
metricCollector
|
||||||
tags map[string]string
|
tags map[string]string
|
||||||
config TopProcsCollectorConfig
|
config TopProcsCollectorConfig
|
||||||
|
statsProcessedMetrics int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *TopProcsCollector) Init(config json.RawMessage) error {
|
func (m *TopProcsCollector) Init(config json.RawMessage) error {
|
||||||
@@ -48,6 +50,7 @@ func (m *TopProcsCollector) Init(config json.RawMessage) error {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return errors.New("failed to execute command")
|
return errors.New("failed to execute command")
|
||||||
}
|
}
|
||||||
|
m.statsProcessedMetrics = 0
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -70,8 +73,10 @@ func (m *TopProcsCollector) Read(interval time.Duration, output chan lp.CCMetric
|
|||||||
y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": string(lines[i])}, time.Now())
|
y, err := lp.New(name, m.tags, m.meta, map[string]interface{}{"value": string(lines[i])}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
output <- y
|
output <- y
|
||||||
|
m.statsProcessedMetrics++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
stats.ComponentStatInt(m.name, "processed_metrics", m.statsProcessedMetrics)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *TopProcsCollector) Close() {
|
func (m *TopProcsCollector) Close() {
|
||||||
|
@@ -169,7 +169,10 @@ func DieList() []int {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return dielist
|
if len(dielist) > 0 {
|
||||||
|
return dielist
|
||||||
|
}
|
||||||
|
return SocketList()
|
||||||
}
|
}
|
||||||
|
|
||||||
type CpuEntry struct {
|
type CpuEntry struct {
|
||||||
@@ -261,7 +264,7 @@ func CpuData() []CpuEntry {
|
|||||||
for _, c := range CpuList() {
|
for _, c := range CpuList() {
|
||||||
clist = append(clist, CpuEntry{Cpuid: c})
|
clist = append(clist, CpuEntry{Cpuid: c})
|
||||||
}
|
}
|
||||||
for _, centry := range clist {
|
for i, centry := range clist {
|
||||||
centry.Socket = -1
|
centry.Socket = -1
|
||||||
centry.Numadomain = -1
|
centry.Numadomain = -1
|
||||||
centry.Die = -1
|
centry.Die = -1
|
||||||
@@ -289,6 +292,8 @@ func CpuData() []CpuEntry {
|
|||||||
// Lookup NUMA domain id
|
// Lookup NUMA domain id
|
||||||
centry.Numadomain = getNumaDomain(base)
|
centry.Numadomain = getNumaDomain(base)
|
||||||
|
|
||||||
|
// Update values in output list
|
||||||
|
clist[i] = centry
|
||||||
}
|
}
|
||||||
return clist
|
return clist
|
||||||
}
|
}
|
||||||
|
@@ -8,6 +8,8 @@ The CCMetric router sits in between the collectors and the sinks and can be used
|
|||||||
{
|
{
|
||||||
"num_cache_intervals" : 1,
|
"num_cache_intervals" : 1,
|
||||||
"interval_timestamp" : true,
|
"interval_timestamp" : true,
|
||||||
|
"hostname_tag" : "hostname",
|
||||||
|
"max_forward" : 50,
|
||||||
"add_tags" : [
|
"add_tags" : [
|
||||||
{
|
{
|
||||||
"key" : "cluster",
|
"key" : "cluster",
|
||||||
@@ -55,6 +57,20 @@ The CCMetric router sits in between the collectors and the sinks and can be used
|
|||||||
```
|
```
|
||||||
|
|
||||||
There are three main options `add_tags`, `delete_tags` and `interval_timestamp`. `add_tags` and `delete_tags` are lists consisting of dicts with `key`, `value` and `if`. The `value` can be omitted in the `delete_tags` part as it only uses the `key` for removal. The `interval_timestamp` setting means that a unique timestamp is applied to all metrics traversing the router during an interval.
|
There are three main options `add_tags`, `delete_tags` and `interval_timestamp`. `add_tags` and `delete_tags` are lists consisting of dicts with `key`, `value` and `if`. The `value` can be omitted in the `delete_tags` part as it only uses the `key` for removal. The `interval_timestamp` setting means that a unique timestamp is applied to all metrics traversing the router during an interval.
|
||||||
|
|
||||||
|
# Processing order in the router
|
||||||
|
|
||||||
|
- Add the `hostname_tag` tag (if sent by collectors or cache)
|
||||||
|
- If `interval_timestamp == true`, change time of metrics
|
||||||
|
- Check if metric should be dropped (`drop_metrics` and `drop_metrics_if`)
|
||||||
|
- Add tags from `add_tags`
|
||||||
|
- Delete tags from `delete_tags`
|
||||||
|
- Rename metric based on `rename_metrics` and store old name as `oldname` in meta information
|
||||||
|
- Add tags from `add_tags` (if you used the new name in the `if` condition)
|
||||||
|
- Delete tags from `delete_tags` (if you used the new name in the `if` condition)
|
||||||
|
- Send to sinks
|
||||||
|
- Move to cache (if `num_cache_intervals > 0`)
|
||||||
|
|
||||||
# The `interval_timestamp` option
|
# The `interval_timestamp` option
|
||||||
|
|
||||||
The collectors' `Read()` functions are not called simultaneously and therefore the metrics gathered in an interval can have different timestamps. If you want to avoid that and have a common timestamp (the beginning of the interval), set this option to `true` and the MetricRouter sets the time.
|
The collectors' `Read()` functions are not called simultaneously and therefore the metrics gathered in an interval can have different timestamps. If you want to avoid that and have a common timestamp (the beginning of the interval), set this option to `true` and the MetricRouter sets the time.
|
||||||
@@ -65,6 +81,14 @@ If the MetricRouter should buffer metrics of intervals in a MetricCache, this op
|
|||||||
|
|
||||||
A `num_cache_intervals > 0` is required to use the `interval_aggregates` option.
|
A `num_cache_intervals > 0` is required to use the `interval_aggregates` option.
|
||||||
|
|
||||||
|
# The `hostname_tag` option
|
||||||
|
|
||||||
|
By default, the router tags all locally created metrics with the hostname. The default tag name is `hostname`, but it can be changed if your organization prefers a different name.
|
||||||
|
|
||||||
|
# The `max_forward` option
|
||||||
|
|
||||||
|
Every time the router receives a metric through any of the channels, it tries to directly read up to `max_forward` metrics from the same channel. This is done because the router thread would otherwise go to sleep and wake up with every arriving metric. The default is `50` metrics at once, and `max_forward` needs to be greater than `1`.
|
||||||
|
|
||||||
# The `rename_metrics` option
|
# The `rename_metrics` option
|
||||||
|
|
||||||
In the ClusterCockpit world we specified a set of standard metrics. Since some collectors determine the metric names based on files, executables and libraries, they might change from system to system (or installation to installation, OS to OS, ...). In order to get the common names, you can rename incoming metrics before sending them to the sink. If the metric name matches the `oldname`, it is changed to `newname`
|
In the ClusterCockpit world we specified a set of standard metrics. Since some collectors determine the metric names based on files, executables and libraries, they might change from system to system (or installation to installation, OS to OS, ...). In order to get the common names, you can rename incoming metrics before sending them to the sink. If the metric name matches the `oldname`, it is changed to `newname`
|
||||||
|
17
internal/metricRouter/StatsApi.md
Normal file
17
internal/metricRouter/StatsApi.md
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
# Stats API
|
||||||
|
|
||||||
|
The Stats API can be used for debugging. It publishes counts from different components of the CC Metric Collector as JSON at an HTTP endpoint.
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
|
||||||
|
The Stats API has its own configuration file to specify the listen host and port. The defaults are `localhost` and `8080`.
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"bindhost" : "",
|
||||||
|
"port" : "8080",
|
||||||
|
"publish_collectorstate" : true
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
The `bindhost` and `port` options specify the listen host and port. The `publish_collectorstate` option needs to be `true`, otherwise nothing is presented. This option is for future use if we need to publish more information using different domains.
|
232
internal/metricRouter/metricApi.go
Normal file
232
internal/metricRouter/metricApi.go
Normal file
@@ -0,0 +1,232 @@
|
|||||||
|
package metricRouter
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"net/http"
|
||||||
|
"os"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
mct "github.com/ClusterCockpit/cc-metric-collector/internal/multiChanTicker"
|
||||||
|
"github.com/gorilla/mux"
|
||||||
|
)
|
||||||
|
|
||||||
|
// statsApiConfig holds the settings decoded from the Stats API JSON
// configuration file.
type statsApiConfig struct {
	// Host and Port are joined as "host:port" to form the listen address
	// of the HTTP server.
	Host string `json:"bindhost"`
	Port string `json:"port"`

	// PublishCollectorState registers the "/" handler that serves the
	// collected statistics; without it no handler is registered.
	PublishCollectorState bool `json:"publish_collectorstate"`
}
|
||||||
|
|
||||||
|
// Metric cache data structure
|
||||||
|
type statsApi struct {
|
||||||
|
name string
|
||||||
|
input chan lp.CCMetric
|
||||||
|
indone chan bool
|
||||||
|
outdone chan bool
|
||||||
|
config statsApiConfig
|
||||||
|
wg *sync.WaitGroup
|
||||||
|
statsWg sync.WaitGroup
|
||||||
|
ticker mct.MultiChanTicker
|
||||||
|
tickchan chan time.Time
|
||||||
|
server *http.Server
|
||||||
|
router *mux.Router
|
||||||
|
lock sync.Mutex
|
||||||
|
baseurl string
|
||||||
|
stats map[string]map[string]int64
|
||||||
|
outStats map[string]map[string]int64
|
||||||
|
}
|
||||||
|
|
||||||
|
// StatsApi is the public interface of the Stats API server created by
// NewStatsApi.
type StatsApi interface {
	// Start launches the background goroutines and the HTTP server.
	Start()
	// Close stops the background goroutines and shuts the HTTP server down.
	Close()
	// StatsFunc is the HTTP handler that writes the collected statistics
	// as JSON.
	StatsFunc(w http.ResponseWriter, r *http.Request)
}
|
||||||
|
|
||||||
|
// statsApiServer is the package-wide Stats API instance set by NewStatsApi.
// While it is nil, ComponentStatInt and ComponentStatString do nothing.
var statsApiServer *statsApi
|
||||||
|
|
||||||
|
func (a *statsApi) updateStats(point lp.CCMetric) {
|
||||||
|
switch point.Name() {
|
||||||
|
case "_stats":
|
||||||
|
if name, nok := point.GetMeta("source"); nok {
|
||||||
|
var compStats map[string]int64
|
||||||
|
var ok bool
|
||||||
|
|
||||||
|
if compStats, ok = a.stats[name]; !ok {
|
||||||
|
a.stats[name] = make(map[string]int64)
|
||||||
|
compStats = a.stats[name]
|
||||||
|
}
|
||||||
|
for k, v := range point.Fields() {
|
||||||
|
switch value := v.(type) {
|
||||||
|
case int:
|
||||||
|
compStats[k] = int64(value)
|
||||||
|
case uint:
|
||||||
|
compStats[k] = int64(value)
|
||||||
|
case int32:
|
||||||
|
compStats[k] = int64(value)
|
||||||
|
case uint32:
|
||||||
|
compStats[k] = int64(value)
|
||||||
|
case int64:
|
||||||
|
compStats[k] = int64(value)
|
||||||
|
case uint64:
|
||||||
|
compStats[k] = int64(value)
|
||||||
|
default:
|
||||||
|
cclog.ComponentDebug(a.name, "Unusable stats for", k, ". Values should be int64")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
a.stats[name] = compStats
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *statsApi) Start() {
|
||||||
|
a.ticker.AddChannel(a.tickchan)
|
||||||
|
a.wg.Add(1)
|
||||||
|
a.statsWg.Add(1)
|
||||||
|
go func() {
|
||||||
|
a.stats = make(map[string]map[string]int64)
|
||||||
|
defer a.statsWg.Done()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-a.indone:
|
||||||
|
cclog.ComponentDebug(a.name, "INPUT DONE")
|
||||||
|
close(a.indone)
|
||||||
|
return
|
||||||
|
case p := <-a.input:
|
||||||
|
a.lock.Lock()
|
||||||
|
a.updateStats(p)
|
||||||
|
a.lock.Unlock()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
a.statsWg.Add(1)
|
||||||
|
go func() {
|
||||||
|
a.outStats = make(map[string]map[string]int64)
|
||||||
|
defer a.statsWg.Done()
|
||||||
|
a.lock.Lock()
|
||||||
|
for comp, compData := range a.stats {
|
||||||
|
var outData map[string]int64
|
||||||
|
var ok bool
|
||||||
|
if outData, ok = a.outStats[comp]; !ok {
|
||||||
|
outData = make(map[string]int64)
|
||||||
|
}
|
||||||
|
for k, v := range compData {
|
||||||
|
outData[k] = v
|
||||||
|
}
|
||||||
|
a.outStats[comp] = outData
|
||||||
|
}
|
||||||
|
a.lock.Unlock()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-a.outdone:
|
||||||
|
cclog.ComponentDebug(a.name, "OUTPUT DONE")
|
||||||
|
close(a.outdone)
|
||||||
|
return
|
||||||
|
case <-a.tickchan:
|
||||||
|
a.lock.Lock()
|
||||||
|
for comp, compData := range a.stats {
|
||||||
|
var outData map[string]int64
|
||||||
|
var ok bool
|
||||||
|
if outData, ok = a.outStats[comp]; !ok {
|
||||||
|
outData = make(map[string]int64)
|
||||||
|
}
|
||||||
|
for k, v := range compData {
|
||||||
|
outData[k] = v
|
||||||
|
}
|
||||||
|
a.outStats[comp] = outData
|
||||||
|
}
|
||||||
|
a.lock.Unlock()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
a.statsWg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer a.statsWg.Done()
|
||||||
|
err := a.server.ListenAndServe()
|
||||||
|
if err != nil && err.Error() != "http: Server closed" {
|
||||||
|
cclog.ComponentError(a.name, err.Error())
|
||||||
|
}
|
||||||
|
cclog.ComponentDebug(a.name, "SERVER DONE")
|
||||||
|
}()
|
||||||
|
cclog.ComponentDebug(a.name, "STARTED")
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *statsApi) StatsFunc(w http.ResponseWriter, r *http.Request) {
|
||||||
|
data, err := json.Marshal(a.outStats)
|
||||||
|
if err == nil {
|
||||||
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
io.WriteString(w, string(data))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close finishes / stops the metric cache
|
||||||
|
func (a *statsApi) Close() {
|
||||||
|
cclog.ComponentDebug(a.name, "CLOSE")
|
||||||
|
a.indone <- true
|
||||||
|
a.outdone <- true
|
||||||
|
a.server.Shutdown(context.Background())
|
||||||
|
// wait for close of channel r.done
|
||||||
|
<-a.indone
|
||||||
|
<-a.outdone
|
||||||
|
a.statsWg.Wait()
|
||||||
|
a.wg.Done()
|
||||||
|
|
||||||
|
//a.wg.Wait()
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewStatsApi(ticker mct.MultiChanTicker, wg *sync.WaitGroup, statsApiConfigfile string) (StatsApi, error) {
|
||||||
|
a := new(statsApi)
|
||||||
|
a.name = "StatsApi"
|
||||||
|
a.config.Host = "localhost"
|
||||||
|
a.config.Port = "8080"
|
||||||
|
configFile, err := os.Open(statsApiConfigfile)
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentError(a.name, err.Error())
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer configFile.Close()
|
||||||
|
jsonParser := json.NewDecoder(configFile)
|
||||||
|
err = jsonParser.Decode(&a.config)
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentError(a.name, err.Error())
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
a.input = make(chan lp.CCMetric)
|
||||||
|
a.ticker = ticker
|
||||||
|
a.tickchan = make(chan time.Time)
|
||||||
|
a.wg = wg
|
||||||
|
a.indone = make(chan bool)
|
||||||
|
a.outdone = make(chan bool)
|
||||||
|
a.router = mux.NewRouter()
|
||||||
|
a.baseurl = fmt.Sprintf("%s:%s", a.config.Host, a.config.Port)
|
||||||
|
a.server = &http.Server{Addr: a.baseurl, Handler: a.router}
|
||||||
|
if a.config.PublishCollectorState {
|
||||||
|
a.router.HandleFunc("/", a.StatsFunc)
|
||||||
|
}
|
||||||
|
statsApiServer = a
|
||||||
|
return a, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func ComponentStatInt(component string, key string, value int64) {
|
||||||
|
if statsApiServer == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
y, err := lp.New("_stats", map[string]string{}, map[string]string{"source": component}, map[string]interface{}{key: value}, time.Now())
|
||||||
|
if err == nil {
|
||||||
|
statsApiServer.input <- y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func ComponentStatString(component string, key string, value int64) {
|
||||||
|
if statsApiServer == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
y, err := lp.New("_stats", map[string]string{}, map[string]string{"source": component}, map[string]interface{}{key: value}, time.Now())
|
||||||
|
if err == nil {
|
||||||
|
statsApiServer.input <- y
|
||||||
|
}
|
||||||
|
}
|
@@ -40,20 +40,26 @@ type metricRouterConfig struct {
|
|||||||
|
|
||||||
// Metric router data structure
|
// Metric router data structure
|
||||||
type metricRouter struct {
|
type metricRouter struct {
|
||||||
hostname string // Hostname used in tags
|
hostname string // Hostname used in tags
|
||||||
coll_input chan lp.CCMetric // Input channel from CollectorManager
|
coll_input chan lp.CCMetric // Input channel from CollectorManager
|
||||||
recv_input chan lp.CCMetric // Input channel from ReceiveManager
|
recv_input chan lp.CCMetric // Input channel from ReceiveManager
|
||||||
cache_input chan lp.CCMetric // Input channel from MetricCache
|
cache_input chan lp.CCMetric // Input channel from MetricCache
|
||||||
outputs []chan lp.CCMetric // List of all output channels
|
outputs []chan lp.CCMetric // List of all output channels
|
||||||
done chan bool // channel to finish / stop metric router
|
done chan bool // channel to finish / stop metric router
|
||||||
wg *sync.WaitGroup // wait group for all goroutines in cc-metric-collector
|
wg *sync.WaitGroup // wait group for all goroutines in cc-metric-collector
|
||||||
timestamp time.Time // timestamp periodically updated by ticker each interval
|
timestamp time.Time // timestamp periodically updated by ticker each interval
|
||||||
timerdone chan bool // channel to finish / stop timestamp updater
|
timerdone chan bool // channel to finish / stop timestamp updater
|
||||||
ticker mct.MultiChanTicker // periodically ticking once each interval
|
ticker mct.MultiChanTicker // periodically ticking once each interval
|
||||||
config metricRouterConfig // json encoded config for metric router
|
config metricRouterConfig // json encoded config for metric router
|
||||||
cache MetricCache // pointer to MetricCache
|
cache MetricCache // pointer to MetricCache
|
||||||
cachewg sync.WaitGroup // wait group for MetricCache
|
cachewg sync.WaitGroup // wait group for MetricCache
|
||||||
maxForward int // number of metrics to forward maximally in one iteration
|
maxForward int // number of metrics to forward maximally in one iteration
|
||||||
|
statsCollForward int64
|
||||||
|
statsRecvForward int64
|
||||||
|
statsCacheForward int64
|
||||||
|
statsTotalForward int64
|
||||||
|
statsDropped int64
|
||||||
|
statsRenamed int64
|
||||||
}
|
}
|
||||||
|
|
||||||
// MetricRouter access functions
|
// MetricRouter access functions
|
||||||
@@ -103,7 +109,10 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout
|
|||||||
cclog.ComponentError("MetricRouter", err.Error())
|
cclog.ComponentError("MetricRouter", err.Error())
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
r.maxForward = r.config.MaxForward
|
r.maxForward = 1
|
||||||
|
if r.config.MaxForward > r.maxForward {
|
||||||
|
r.maxForward = r.config.MaxForward
|
||||||
|
}
|
||||||
if r.config.NumCacheIntervals > 0 {
|
if r.config.NumCacheIntervals > 0 {
|
||||||
r.cache, err = NewCache(r.cache_input, r.ticker, &r.cachewg, r.config.NumCacheIntervals)
|
r.cache, err = NewCache(r.cache_input, r.ticker, &r.cachewg, r.config.NumCacheIntervals)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -118,6 +127,12 @@ func (r *metricRouter) Init(ticker mct.MultiChanTicker, wg *sync.WaitGroup, rout
|
|||||||
for _, mname := range r.config.DropMetrics {
|
for _, mname := range r.config.DropMetrics {
|
||||||
r.config.dropMetrics[mname] = true
|
r.config.dropMetrics[mname] = true
|
||||||
}
|
}
|
||||||
|
r.statsCollForward = 0
|
||||||
|
r.statsRecvForward = 0
|
||||||
|
r.statsCacheForward = 0
|
||||||
|
r.statsTotalForward = 0
|
||||||
|
r.statsDropped = 0
|
||||||
|
r.statsRenamed = 0
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -137,6 +152,7 @@ func (r *metricRouter) StartTimer() {
|
|||||||
cclog.ComponentDebug("MetricRouter", "TIMER DONE")
|
cclog.ComponentDebug("MetricRouter", "TIMER DONE")
|
||||||
return
|
return
|
||||||
case t := <-m:
|
case t := <-m:
|
||||||
|
cclog.ComponentDebug("MetricRouter", "INTERVAL_TICK", t.Unix())
|
||||||
r.timestamp = t
|
r.timestamp = t
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -250,6 +266,8 @@ func (r *metricRouter) Start() {
|
|||||||
r.DoDelTags(point)
|
r.DoDelTags(point)
|
||||||
name := point.Name()
|
name := point.Name()
|
||||||
if new, ok := r.config.RenameMetrics[name]; ok {
|
if new, ok := r.config.RenameMetrics[name]; ok {
|
||||||
|
r.statsRenamed++
|
||||||
|
ComponentStatInt("MetricRouter", "renamed", r.statsRenamed)
|
||||||
point.SetName(new)
|
point.SetName(new)
|
||||||
point.AddMeta("oldname", name)
|
point.AddMeta("oldname", name)
|
||||||
}
|
}
|
||||||
@@ -269,7 +287,14 @@ func (r *metricRouter) Start() {
|
|||||||
p.SetTime(r.timestamp)
|
p.SetTime(r.timestamp)
|
||||||
}
|
}
|
||||||
if !r.dropMetric(p) {
|
if !r.dropMetric(p) {
|
||||||
|
r.statsCollForward++
|
||||||
|
r.statsTotalForward++
|
||||||
|
ComponentStatInt("MetricRouter", "collector_forward", r.statsCollForward)
|
||||||
|
ComponentStatInt("MetricRouter", "total_forward", r.statsTotalForward)
|
||||||
forward(p)
|
forward(p)
|
||||||
|
} else {
|
||||||
|
r.statsDropped++
|
||||||
|
ComponentStatInt("MetricRouter", "dropped", r.statsDropped)
|
||||||
}
|
}
|
||||||
// even if the metric is dropped, it is stored in the cache for
|
// even if the metric is dropped, it is stored in the cache for
|
||||||
// aggregations
|
// aggregations
|
||||||
@@ -285,7 +310,14 @@ func (r *metricRouter) Start() {
|
|||||||
p.SetTime(r.timestamp)
|
p.SetTime(r.timestamp)
|
||||||
}
|
}
|
||||||
if !r.dropMetric(p) {
|
if !r.dropMetric(p) {
|
||||||
|
r.statsRecvForward++
|
||||||
|
r.statsTotalForward++
|
||||||
|
ComponentStatInt("MetricRouter", "receiver_forward", r.statsRecvForward)
|
||||||
|
ComponentStatInt("MetricRouter", "total_forward", r.statsTotalForward)
|
||||||
forward(p)
|
forward(p)
|
||||||
|
} else {
|
||||||
|
r.statsDropped++
|
||||||
|
ComponentStatInt("MetricRouter", "dropped", r.statsDropped)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -294,7 +326,14 @@ func (r *metricRouter) Start() {
|
|||||||
// receive from metric collector
|
// receive from metric collector
|
||||||
if !r.dropMetric(p) {
|
if !r.dropMetric(p) {
|
||||||
p.AddTag(r.config.HostnameTagName, r.hostname)
|
p.AddTag(r.config.HostnameTagName, r.hostname)
|
||||||
|
r.statsCacheForward++
|
||||||
|
r.statsTotalForward++
|
||||||
|
ComponentStatInt("MetricRouter", "cache_forward", r.statsCacheForward)
|
||||||
|
ComponentStatInt("MetricRouter", "total_forward", r.statsTotalForward)
|
||||||
forward(p)
|
forward(p)
|
||||||
|
} else {
|
||||||
|
r.statsDropped++
|
||||||
|
ComponentStatInt("MetricRouter", "dropped", r.statsDropped)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -11,6 +11,7 @@ import (
|
|||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
)
|
)
|
||||||
|
|
||||||
const GMETRIC_EXEC = `gmetric`
|
const GMETRIC_EXEC = `gmetric`
|
||||||
@@ -29,9 +30,10 @@ type GangliaSinkConfig struct {
|
|||||||
|
|
||||||
type GangliaSink struct {
|
type GangliaSink struct {
|
||||||
sink
|
sink
|
||||||
gmetric_path string
|
gmetric_path string
|
||||||
gmetric_config string
|
gmetric_config string
|
||||||
config GangliaSinkConfig
|
config GangliaSinkConfig
|
||||||
|
statsSentMetrics int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *GangliaSink) Write(point lp.CCMetric) error {
|
func (s *GangliaSink) Write(point lp.CCMetric) error {
|
||||||
@@ -78,6 +80,8 @@ func (s *GangliaSink) Write(point lp.CCMetric) error {
|
|||||||
command := exec.Command(s.gmetric_path, argstr...)
|
command := exec.Command(s.gmetric_path, argstr...)
|
||||||
command.Wait()
|
command.Wait()
|
||||||
_, err = command.Output()
|
_, err = command.Output()
|
||||||
|
s.statsSentMetrics++
|
||||||
|
stats.ComponentStatInt(s.name, "sent_metrics", s.statsSentMetrics)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -120,5 +124,6 @@ func NewGangliaSink(name string, config json.RawMessage) (Sink, error) {
|
|||||||
if len(s.config.GmetricConfig) > 0 {
|
if len(s.config.GmetricConfig) > 0 {
|
||||||
s.gmetric_config = s.config.GmetricConfig
|
s.gmetric_config = s.config.GmetricConfig
|
||||||
}
|
}
|
||||||
|
s.statsSentMetrics = 0
|
||||||
return s, nil
|
return s, nil
|
||||||
}
|
}
|
||||||
|
@@ -11,6 +11,7 @@ import (
|
|||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
influx "github.com/influxdata/line-protocol"
|
influx "github.com/influxdata/line-protocol"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -36,19 +37,21 @@ type HttpSink struct {
|
|||||||
idleConnTimeout time.Duration
|
idleConnTimeout time.Duration
|
||||||
timeout time.Duration
|
timeout time.Duration
|
||||||
flushDelay time.Duration
|
flushDelay time.Duration
|
||||||
|
statsProcessed int64
|
||||||
|
statsFlushes int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *HttpSink) Write(m lp.CCMetric) error {
|
func (s *HttpSink) Write(m lp.CCMetric) error {
|
||||||
if s.buffer.Len() == 0 && s.flushDelay != 0 {
|
if s.buffer.Len() == 0 && s.flushDelay != 0 {
|
||||||
// This is the first write since the last flush, start the flushTimer!
|
// This is the first write since the last flush, start the flushTimer!
|
||||||
if s.flushTimer != nil && s.flushTimer.Stop() {
|
if s.flushTimer != nil && s.flushTimer.Stop() {
|
||||||
cclog.ComponentDebug("HttpSink", "unexpected: the flushTimer was already running?")
|
cclog.ComponentDebug(s.name, "unexpected: the flushTimer was already running?")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Run a batched flush for all lines that have arrived in the last second
|
// Run a batched flush for all lines that have arrived in the last second
|
||||||
s.flushTimer = time.AfterFunc(s.flushDelay, func() {
|
s.flushTimer = time.AfterFunc(s.flushDelay, func() {
|
||||||
if err := s.Flush(); err != nil {
|
if err := s.Flush(); err != nil {
|
||||||
cclog.ComponentError("HttpSink", "flush failed:", err.Error())
|
cclog.ComponentError(s.name, "flush failed:", err.Error())
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -60,8 +63,11 @@ func (s *HttpSink) Write(m lp.CCMetric) error {
|
|||||||
s.lock.Unlock() // defer does not work here as Flush() takes the lock as well
|
s.lock.Unlock() // defer does not work here as Flush() takes the lock as well
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
cclog.ComponentError(s.name, "encoding failed:", err.Error())
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
s.statsProcessed++
|
||||||
|
stats.ComponentStatInt(s.name, "processed_metrics", s.statsProcessed)
|
||||||
|
|
||||||
// Flush synchronously if "flush_delay" is zero
|
// Flush synchronously if "flush_delay" is zero
|
||||||
if s.flushDelay == 0 {
|
if s.flushDelay == 0 {
|
||||||
@@ -84,6 +90,7 @@ func (s *HttpSink) Flush() error {
|
|||||||
// Create new request to send buffer
|
// Create new request to send buffer
|
||||||
req, err := http.NewRequest(http.MethodPost, s.config.URL, s.buffer)
|
req, err := http.NewRequest(http.MethodPost, s.config.URL, s.buffer)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
cclog.ComponentError(s.name, "failed to create request:", err.Error())
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -100,13 +107,18 @@ func (s *HttpSink) Flush() error {
|
|||||||
|
|
||||||
// Handle transport/tcp errors
|
// Handle transport/tcp errors
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
cclog.ComponentError(s.name, "transport/tcp error:", err.Error())
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
// Handle application errors
|
// Handle application errors
|
||||||
if res.StatusCode != http.StatusOK {
|
if res.StatusCode != http.StatusOK {
|
||||||
return errors.New(res.Status)
|
err = errors.New(res.Status)
|
||||||
|
cclog.ComponentError(s.name, "application error:", err.Error())
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
s.statsFlushes++
|
||||||
|
stats.ComponentStatInt(s.name, "flushes", s.statsFlushes)
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -114,7 +126,7 @@ func (s *HttpSink) Flush() error {
|
|||||||
func (s *HttpSink) Close() {
|
func (s *HttpSink) Close() {
|
||||||
s.flushTimer.Stop()
|
s.flushTimer.Stop()
|
||||||
if err := s.Flush(); err != nil {
|
if err := s.Flush(); err != nil {
|
||||||
cclog.ComponentError("HttpSink", "flush failed:", err.Error())
|
cclog.ComponentError(s.name, "flush failed:", err.Error())
|
||||||
}
|
}
|
||||||
s.client.CloseIdleConnections()
|
s.client.CloseIdleConnections()
|
||||||
}
|
}
|
||||||
@@ -172,5 +184,7 @@ func NewHttpSink(name string, config json.RawMessage) (Sink, error) {
|
|||||||
s.buffer = &bytes.Buffer{}
|
s.buffer = &bytes.Buffer{}
|
||||||
s.encoder = influx.NewEncoder(s.buffer)
|
s.encoder = influx.NewEncoder(s.buffer)
|
||||||
s.encoder.SetPrecision(time.Second)
|
s.encoder.SetPrecision(time.Second)
|
||||||
|
s.statsFlushes = 0
|
||||||
|
s.statsProcessed = 0
|
||||||
return s, nil
|
return s, nil
|
||||||
}
|
}
|
||||||
|
@@ -10,6 +10,7 @@ import (
|
|||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
influxdb2 "github.com/influxdata/influxdb-client-go/v2"
|
influxdb2 "github.com/influxdata/influxdb-client-go/v2"
|
||||||
influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api"
|
influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api"
|
||||||
)
|
)
|
||||||
@@ -28,10 +29,10 @@ type InfluxAsyncSinkConfig struct {
|
|||||||
BatchSize uint `json:"batch_size,omitempty"`
|
BatchSize uint `json:"batch_size,omitempty"`
|
||||||
// Interval, in ms, in which is buffer flushed if it has not been already written (by reaching batch size) . Default 1000ms
|
// Interval, in ms, in which is buffer flushed if it has not been already written (by reaching batch size) . Default 1000ms
|
||||||
FlushInterval uint `json:"flush_interval,omitempty"`
|
FlushInterval uint `json:"flush_interval,omitempty"`
|
||||||
InfluxRetryInterval string `json:"retry_interval"`
|
InfluxRetryInterval string `json:"retry_interval,omitempty"`
|
||||||
InfluxExponentialBase uint `json:"retry_exponential_base"`
|
InfluxExponentialBase uint `json:"retry_exponential_base,omitempty"`
|
||||||
InfluxMaxRetries uint `json:"max_retries"`
|
InfluxMaxRetries uint `json:"max_retries,omitempty"`
|
||||||
InfluxMaxRetryTime string `json:"max_retry_time"`
|
InfluxMaxRetryTime string `json:"max_retry_time,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type InfluxAsyncSink struct {
|
type InfluxAsyncSink struct {
|
||||||
@@ -42,6 +43,9 @@ type InfluxAsyncSink struct {
|
|||||||
config InfluxAsyncSinkConfig
|
config InfluxAsyncSinkConfig
|
||||||
influxRetryInterval uint
|
influxRetryInterval uint
|
||||||
influxMaxRetryTime uint
|
influxMaxRetryTime uint
|
||||||
|
sentMetrics int64
|
||||||
|
statsFlushes int64
|
||||||
|
statsErrors int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *InfluxAsyncSink) connect() error {
|
func (s *InfluxAsyncSink) connect() error {
|
||||||
@@ -60,20 +64,34 @@ func (s *InfluxAsyncSink) connect() error {
|
|||||||
cclog.ComponentDebug(s.name, "Using URI", uri, "Org", s.config.Organization, "Bucket", s.config.Database)
|
cclog.ComponentDebug(s.name, "Using URI", uri, "Org", s.config.Organization, "Bucket", s.config.Database)
|
||||||
clientOptions := influxdb2.DefaultOptions()
|
clientOptions := influxdb2.DefaultOptions()
|
||||||
if s.config.BatchSize != 0 {
|
if s.config.BatchSize != 0 {
|
||||||
|
cclog.ComponentDebug(s.name, "Batch size", s.config.BatchSize)
|
||||||
clientOptions.SetBatchSize(s.config.BatchSize)
|
clientOptions.SetBatchSize(s.config.BatchSize)
|
||||||
}
|
}
|
||||||
if s.config.FlushInterval != 0 {
|
if s.config.FlushInterval != 0 {
|
||||||
|
cclog.ComponentDebug(s.name, "Flush interval", s.config.FlushInterval)
|
||||||
clientOptions.SetFlushInterval(s.config.FlushInterval)
|
clientOptions.SetFlushInterval(s.config.FlushInterval)
|
||||||
}
|
}
|
||||||
|
if s.influxRetryInterval != 0 {
|
||||||
|
cclog.ComponentDebug(s.name, "MaxRetryInterval", s.influxRetryInterval)
|
||||||
|
clientOptions.SetMaxRetryInterval(s.influxRetryInterval)
|
||||||
|
}
|
||||||
|
if s.influxMaxRetryTime != 0 {
|
||||||
|
cclog.ComponentDebug(s.name, "MaxRetryTime", s.influxMaxRetryTime)
|
||||||
|
clientOptions.SetMaxRetryTime(s.influxMaxRetryTime)
|
||||||
|
}
|
||||||
|
if s.config.InfluxExponentialBase != 0 {
|
||||||
|
cclog.ComponentDebug(s.name, "Exponential Base", s.config.InfluxExponentialBase)
|
||||||
|
clientOptions.SetExponentialBase(s.config.InfluxExponentialBase)
|
||||||
|
}
|
||||||
|
if s.config.InfluxMaxRetries != 0 {
|
||||||
|
cclog.ComponentDebug(s.name, "Max Retries", s.config.InfluxMaxRetries)
|
||||||
|
clientOptions.SetMaxRetries(s.config.InfluxMaxRetries)
|
||||||
|
}
|
||||||
clientOptions.SetTLSConfig(
|
clientOptions.SetTLSConfig(
|
||||||
&tls.Config{
|
&tls.Config{
|
||||||
InsecureSkipVerify: true,
|
InsecureSkipVerify: true,
|
||||||
},
|
},
|
||||||
)
|
).SetPrecision(time.Second)
|
||||||
clientOptions.SetMaxRetryInterval(s.influxRetryInterval)
|
|
||||||
clientOptions.SetMaxRetryTime(s.influxMaxRetryTime)
|
|
||||||
clientOptions.SetExponentialBase(s.config.InfluxExponentialBase)
|
|
||||||
clientOptions.SetMaxRetries(s.config.InfluxMaxRetries)
|
|
||||||
|
|
||||||
s.client = influxdb2.NewClientWithOptions(uri, auth, clientOptions)
|
s.client = influxdb2.NewClientWithOptions(uri, auth, clientOptions)
|
||||||
s.writeApi = s.client.WriteAPI(s.config.Organization, s.config.Database)
|
s.writeApi = s.client.WriteAPI(s.config.Organization, s.config.Database)
|
||||||
@@ -91,11 +109,15 @@ func (s *InfluxAsyncSink) Write(m lp.CCMetric) error {
|
|||||||
s.writeApi.WritePoint(
|
s.writeApi.WritePoint(
|
||||||
m.ToPoint(s.meta_as_tags),
|
m.ToPoint(s.meta_as_tags),
|
||||||
)
|
)
|
||||||
|
s.sentMetrics++
|
||||||
|
stats.ComponentStatInt(s.name, "send_metrics", s.sentMetrics)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *InfluxAsyncSink) Flush() error {
|
func (s *InfluxAsyncSink) Flush() error {
|
||||||
s.writeApi.Flush()
|
s.writeApi.Flush()
|
||||||
|
s.statsFlushes++
|
||||||
|
stats.ComponentStatInt(s.name, "flushes", s.statsFlushes)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -110,13 +132,14 @@ func NewInfluxAsyncSink(name string, config json.RawMessage) (Sink, error) {
|
|||||||
s.name = fmt.Sprintf("InfluxSink(%s)", name)
|
s.name = fmt.Sprintf("InfluxSink(%s)", name)
|
||||||
|
|
||||||
// Set default for maximum number of points sent to server in single request.
|
// Set default for maximum number of points sent to server in single request.
|
||||||
s.config.BatchSize = 100
|
s.config.BatchSize = 0
|
||||||
s.influxRetryInterval = uint(time.Duration(1) * time.Second)
|
s.influxRetryInterval = 0
|
||||||
s.config.InfluxRetryInterval = "1s"
|
//s.config.InfluxRetryInterval = "1s"
|
||||||
s.influxMaxRetryTime = uint(7 * time.Duration(24) * time.Hour)
|
s.influxMaxRetryTime = 0
|
||||||
s.config.InfluxMaxRetryTime = "168h"
|
//s.config.InfluxMaxRetryTime = "168h"
|
||||||
s.config.InfluxMaxRetries = 20
|
s.config.InfluxMaxRetries = 0
|
||||||
s.config.InfluxExponentialBase = 2
|
s.config.InfluxExponentialBase = 0
|
||||||
|
s.config.FlushInterval = 0
|
||||||
|
|
||||||
// Default retry intervals (in seconds)
|
// Default retry intervals (in seconds)
|
||||||
// 1 2
|
// 1 2
|
||||||
@@ -174,12 +197,17 @@ func NewInfluxAsyncSink(name string, config json.RawMessage) (Sink, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Start background: Read from error channel
|
// Start background: Read from error channel
|
||||||
|
s.statsErrors = 0
|
||||||
s.errors = s.writeApi.Errors()
|
s.errors = s.writeApi.Errors()
|
||||||
go func() {
|
go func() {
|
||||||
for err := range s.errors {
|
for err := range s.errors {
|
||||||
|
s.statsErrors++
|
||||||
|
stats.ComponentStatInt(s.name, "errors", s.statsErrors)
|
||||||
cclog.ComponentError(s.name, err.Error())
|
cclog.ComponentError(s.name, err.Error())
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
s.sentMetrics = 0
|
||||||
|
s.statsFlushes = 0
|
||||||
return s, nil
|
return s, nil
|
||||||
}
|
}
|
||||||
|
@@ -6,38 +6,49 @@ import (
|
|||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
influxdb2 "github.com/influxdata/influxdb-client-go/v2"
|
influxdb2 "github.com/influxdata/influxdb-client-go/v2"
|
||||||
influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api"
|
influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api"
|
||||||
|
"github.com/influxdata/influxdb-client-go/v2/api/write"
|
||||||
)
|
)
|
||||||
|
|
||||||
type InfluxSinkConfig struct {
|
type InfluxSinkConfig struct {
|
||||||
defaultSinkConfig
|
defaultSinkConfig
|
||||||
Host string `json:"host,omitempty"`
|
Host string `json:"host,omitempty"`
|
||||||
Port string `json:"port,omitempty"`
|
Port string `json:"port,omitempty"`
|
||||||
Database string `json:"database,omitempty"`
|
Database string `json:"database,omitempty"`
|
||||||
User string `json:"user,omitempty"`
|
User string `json:"user,omitempty"`
|
||||||
Password string `json:"password,omitempty"`
|
Password string `json:"password,omitempty"`
|
||||||
Organization string `json:"organization,omitempty"`
|
Organization string `json:"organization,omitempty"`
|
||||||
SSL bool `json:"ssl,omitempty"`
|
SSL bool `json:"ssl,omitempty"`
|
||||||
RetentionPol string `json:"retention_policy,omitempty"`
|
FlushDelay string `json:"flush_delay,omitempty"`
|
||||||
InfluxRetryInterval string `json:"retry_interval"`
|
BatchSize int `json:"batch_size,omitempty"`
|
||||||
InfluxExponentialBase uint `json:"retry_exponential_base"`
|
RetentionPol string `json:"retention_policy,omitempty"`
|
||||||
InfluxMaxRetries uint `json:"max_retries"`
|
// InfluxRetryInterval string `json:"retry_interval"`
|
||||||
InfluxMaxRetryTime string `json:"max_retry_time"`
|
// InfluxExponentialBase uint `json:"retry_exponential_base"`
|
||||||
|
// InfluxMaxRetries uint `json:"max_retries"`
|
||||||
|
// InfluxMaxRetryTime string `json:"max_retry_time"`
|
||||||
//InfluxMaxRetryDelay string `json:"max_retry_delay"` // It is mentioned in the docs but there is no way to set it
|
//InfluxMaxRetryDelay string `json:"max_retry_delay"` // It is mentioned in the docs but there is no way to set it
|
||||||
}
|
}
|
||||||
|
|
||||||
type InfluxSink struct {
|
type InfluxSink struct {
|
||||||
sink
|
sink
|
||||||
client influxdb2.Client
|
client influxdb2.Client
|
||||||
writeApi influxdb2Api.WriteAPIBlocking
|
writeApi influxdb2Api.WriteAPIBlocking
|
||||||
config InfluxSinkConfig
|
config InfluxSinkConfig
|
||||||
influxRetryInterval uint
|
influxRetryInterval uint
|
||||||
influxMaxRetryTime uint
|
influxMaxRetryTime uint
|
||||||
|
batch []*write.Point
|
||||||
|
flushTimer *time.Timer
|
||||||
|
flushDelay time.Duration
|
||||||
|
lock sync.Mutex // Flush() runs in another goroutine, so this lock has to protect the buffer
|
||||||
|
statsSentMetrics int64
|
||||||
|
statsProcessedMetrics int64
|
||||||
//influxMaxRetryDelay uint
|
//influxMaxRetryDelay uint
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -56,16 +67,31 @@ func (s *InfluxSink) connect() error {
|
|||||||
}
|
}
|
||||||
cclog.ComponentDebug(s.name, "Using URI", uri, "Org", s.config.Organization, "Bucket", s.config.Database)
|
cclog.ComponentDebug(s.name, "Using URI", uri, "Org", s.config.Organization, "Bucket", s.config.Database)
|
||||||
clientOptions := influxdb2.DefaultOptions()
|
clientOptions := influxdb2.DefaultOptions()
|
||||||
|
|
||||||
|
// if s.influxRetryInterval != 0 {
|
||||||
|
// cclog.ComponentDebug(s.name, "MaxRetryInterval", s.influxRetryInterval)
|
||||||
|
// clientOptions.SetMaxRetryInterval(s.influxRetryInterval)
|
||||||
|
// }
|
||||||
|
// if s.influxMaxRetryTime != 0 {
|
||||||
|
// cclog.ComponentDebug(s.name, "MaxRetryTime", s.influxMaxRetryTime)
|
||||||
|
// clientOptions.SetMaxRetryTime(s.influxMaxRetryTime)
|
||||||
|
// }
|
||||||
|
// if s.config.InfluxExponentialBase != 0 {
|
||||||
|
// cclog.ComponentDebug(s.name, "Exponential Base", s.config.InfluxExponentialBase)
|
||||||
|
// clientOptions.SetExponentialBase(s.config.InfluxExponentialBase)
|
||||||
|
// }
|
||||||
|
// if s.config.InfluxMaxRetries != 0 {
|
||||||
|
// cclog.ComponentDebug(s.name, "Max Retries", s.config.InfluxMaxRetries)
|
||||||
|
// clientOptions.SetMaxRetries(s.config.InfluxMaxRetries)
|
||||||
|
// }
|
||||||
|
|
||||||
clientOptions.SetTLSConfig(
|
clientOptions.SetTLSConfig(
|
||||||
&tls.Config{
|
&tls.Config{
|
||||||
InsecureSkipVerify: true,
|
InsecureSkipVerify: true,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
clientOptions.SetMaxRetryInterval(s.influxRetryInterval)
|
clientOptions.SetPrecision(time.Second)
|
||||||
clientOptions.SetMaxRetryTime(s.influxMaxRetryTime)
|
|
||||||
clientOptions.SetExponentialBase(s.config.InfluxExponentialBase)
|
|
||||||
clientOptions.SetMaxRetries(s.config.InfluxMaxRetries)
|
|
||||||
|
|
||||||
s.client = influxdb2.NewClientWithOptions(uri, auth, clientOptions)
|
s.client = influxdb2.NewClientWithOptions(uri, auth, clientOptions)
|
||||||
s.writeApi = s.client.WriteAPIBlocking(s.config.Organization, s.config.Database)
|
s.writeApi = s.client.WriteAPIBlocking(s.config.Organization, s.config.Database)
|
||||||
@@ -80,38 +106,80 @@ func (s *InfluxSink) connect() error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *InfluxSink) Write(m lp.CCMetric) error {
|
func (s *InfluxSink) Write(m lp.CCMetric) error {
|
||||||
err :=
|
// err :=
|
||||||
s.writeApi.WritePoint(
|
// s.writeApi.WritePoint(
|
||||||
context.Background(),
|
// context.Background(),
|
||||||
m.ToPoint(s.meta_as_tags),
|
// m.ToPoint(s.meta_as_tags),
|
||||||
)
|
// )
|
||||||
return err
|
if len(s.batch) == 0 && s.flushDelay != 0 {
|
||||||
|
// This is the first write since the last flush, start the flushTimer!
|
||||||
|
if s.flushTimer != nil && s.flushTimer.Stop() {
|
||||||
|
cclog.ComponentDebug(s.name, "unexpected: the flushTimer was already running?")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run a batched flush for all lines that have arrived in the last second
|
||||||
|
s.flushTimer = time.AfterFunc(s.flushDelay, func() {
|
||||||
|
if err := s.Flush(); err != nil {
|
||||||
|
cclog.ComponentError(s.name, "flush failed:", err.Error())
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
p := m.ToPoint(s.meta_as_tags)
|
||||||
|
s.lock.Lock()
|
||||||
|
s.statsProcessedMetrics++
|
||||||
|
s.batch = append(s.batch, p)
|
||||||
|
s.lock.Unlock()
|
||||||
|
stats.ComponentStatInt(s.name, "processed_metrics", s.statsProcessedMetrics)
|
||||||
|
|
||||||
|
// Flush synchronously if "flush_delay" is zero
|
||||||
|
if s.flushDelay == 0 {
|
||||||
|
return s.Flush()
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *InfluxSink) Flush() error {
|
func (s *InfluxSink) Flush() error {
|
||||||
|
s.lock.Lock()
|
||||||
|
defer s.lock.Unlock()
|
||||||
|
if len(s.batch) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
err := s.writeApi.WritePoint(context.Background(), s.batch...)
|
||||||
|
if err != nil {
|
||||||
|
cclog.ComponentError(s.name, "flush failed:", err.Error())
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
s.statsSentMetrics += int64(len(s.batch))
|
||||||
|
stats.ComponentStatInt(s.name, "sent_metrics", s.statsSentMetrics)
|
||||||
|
s.batch = s.batch[:0]
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *InfluxSink) Close() {
|
func (s *InfluxSink) Close() {
|
||||||
cclog.ComponentDebug(s.name, "Closing InfluxDB connection")
|
cclog.ComponentDebug(s.name, "Closing InfluxDB connection")
|
||||||
|
s.flushTimer.Stop()
|
||||||
|
s.Flush()
|
||||||
s.client.Close()
|
s.client.Close()
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewInfluxSink(name string, config json.RawMessage) (Sink, error) {
|
func NewInfluxSink(name string, config json.RawMessage) (Sink, error) {
|
||||||
s := new(InfluxSink)
|
s := new(InfluxSink)
|
||||||
s.name = fmt.Sprintf("InfluxSink(%s)", name)
|
s.name = fmt.Sprintf("InfluxSink(%s)", name)
|
||||||
|
s.config.BatchSize = 100
|
||||||
|
s.config.FlushDelay = "1s"
|
||||||
if len(config) > 0 {
|
if len(config) > 0 {
|
||||||
err := json.Unmarshal(config, &s.config)
|
err := json.Unmarshal(config, &s.config)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
s.influxRetryInterval = uint(time.Duration(1) * time.Second)
|
s.influxRetryInterval = 0
|
||||||
s.config.InfluxRetryInterval = "1s"
|
s.influxMaxRetryTime = 0
|
||||||
s.influxMaxRetryTime = uint(7 * time.Duration(24) * time.Hour)
|
// s.config.InfluxRetryInterval = ""
|
||||||
s.config.InfluxMaxRetryTime = "168h"
|
// s.config.InfluxMaxRetryTime = ""
|
||||||
s.config.InfluxMaxRetries = 20
|
// s.config.InfluxMaxRetries = 0
|
||||||
s.config.InfluxExponentialBase = 2
|
// s.config.InfluxExponentialBase = 0
|
||||||
|
|
||||||
if len(s.config.Host) == 0 ||
|
if len(s.config.Host) == 0 ||
|
||||||
len(s.config.Port) == 0 ||
|
len(s.config.Port) == 0 ||
|
||||||
@@ -126,19 +194,31 @@ func NewInfluxSink(name string, config json.RawMessage) (Sink, error) {
|
|||||||
s.meta_as_tags[k] = true
|
s.meta_as_tags[k] = true
|
||||||
}
|
}
|
||||||
|
|
||||||
toUint := func(duration string, def uint) uint {
|
// toUint := func(duration string, def uint) uint {
|
||||||
t, err := time.ParseDuration(duration)
|
// if len(duration) > 0 {
|
||||||
|
// t, err := time.ParseDuration(duration)
|
||||||
|
// if err == nil {
|
||||||
|
// return uint(t.Milliseconds())
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// return def
|
||||||
|
// }
|
||||||
|
// s.influxRetryInterval = toUint(s.config.InfluxRetryInterval, s.influxRetryInterval)
|
||||||
|
// s.influxMaxRetryTime = toUint(s.config.InfluxMaxRetryTime, s.influxMaxRetryTime)
|
||||||
|
|
||||||
|
if len(s.config.FlushDelay) > 0 {
|
||||||
|
t, err := time.ParseDuration(s.config.FlushDelay)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
return uint(t.Milliseconds())
|
s.flushDelay = t
|
||||||
}
|
}
|
||||||
return def
|
|
||||||
}
|
}
|
||||||
s.influxRetryInterval = toUint(s.config.InfluxRetryInterval, s.influxRetryInterval)
|
s.batch = make([]*write.Point, 0, s.config.BatchSize)
|
||||||
s.influxMaxRetryTime = toUint(s.config.InfluxMaxRetryTime, s.influxMaxRetryTime)
|
|
||||||
|
|
||||||
// Connect to InfluxDB server
|
// Connect to InfluxDB server
|
||||||
if err := s.connect(); err != nil {
|
if err := s.connect(); err != nil {
|
||||||
return nil, fmt.Errorf("unable to connect: %v", err)
|
return nil, fmt.Errorf("unable to connect: %v", err)
|
||||||
}
|
}
|
||||||
|
s.statsSentMetrics = 0
|
||||||
|
s.statsProcessedMetrics = 0
|
||||||
return s, nil
|
return s, nil
|
||||||
}
|
}
|
||||||
|
@@ -17,10 +17,8 @@ The `influxdb` sink uses the official [InfluxDB golang client](https://pkg.go.de
|
|||||||
"password" : "examplepw",
|
"password" : "examplepw",
|
||||||
"organization": "myorg",
|
"organization": "myorg",
|
||||||
"ssl": true,
|
"ssl": true,
|
||||||
"retry_interval" : "1s",
|
"flush_delay" : "1s",
|
||||||
"retry_exponential_base" : 2,
|
"batch_size" : 100
|
||||||
"max_retries": 20,
|
|
||||||
"max_retry_time" : "168h"
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
@@ -34,9 +32,6 @@ The `influxdb` sink uses the official [InfluxDB golang client](https://pkg.go.de
|
|||||||
- `password`: Password for basic authentification
|
- `password`: Password for basic authentification
|
||||||
- `organization`: Organization in the InfluxDB
|
- `organization`: Organization in the InfluxDB
|
||||||
- `ssl`: Use SSL connection
|
- `ssl`: Use SSL connection
|
||||||
- `retry_interval`: Base retry interval for failed write requests, default 1s
|
- `flush_delay`: Group metrics coming in to a single batch
|
||||||
- `retry_exponential_base`: The retry interval is exponentially increased with this base, default 2
|
- `batch_size`: Maximal batch size
|
||||||
- `max_retries`: Maximal number of retry attempts
|
|
||||||
- `max_retry_time`: Maximal time to retry failed writes, default 168h (one week)
|
|
||||||
|
|
||||||
For information about the calculation of the retry interval settings, see [offical influxdb-client-go documentation](https://github.com/influxdata/influxdb-client-go#handling-of-failed-async-writes)
|
|
@@ -73,6 +73,7 @@ import (
|
|||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
"github.com/NVIDIA/go-nvml/pkg/dl"
|
"github.com/NVIDIA/go-nvml/pkg/dl"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -102,11 +103,12 @@ type LibgangliaSinkConfig struct {
|
|||||||
|
|
||||||
type LibgangliaSink struct {
|
type LibgangliaSink struct {
|
||||||
sink
|
sink
|
||||||
config LibgangliaSinkConfig
|
config LibgangliaSinkConfig
|
||||||
global_context C.Ganglia_pool
|
global_context C.Ganglia_pool
|
||||||
gmond_config C.Ganglia_gmond_config
|
gmond_config C.Ganglia_gmond_config
|
||||||
send_channels C.Ganglia_udp_send_channels
|
send_channels C.Ganglia_udp_send_channels
|
||||||
cstrCache map[string]*C.char
|
cstrCache map[string]*C.char
|
||||||
|
statsSentMetrics int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *LibgangliaSink) Write(point lp.CCMetric) error {
|
func (s *LibgangliaSink) Write(point lp.CCMetric) error {
|
||||||
@@ -202,6 +204,8 @@ func (s *LibgangliaSink) Write(point lp.CCMetric) error {
|
|||||||
C.Ganglia_metric_destroy(gmetric)
|
C.Ganglia_metric_destroy(gmetric)
|
||||||
// Free the value C string, the only one not stored in the cache
|
// Free the value C string, the only one not stored in the cache
|
||||||
C.free(unsafe.Pointer(c_value))
|
C.free(unsafe.Pointer(c_value))
|
||||||
|
s.statsSentMetrics++
|
||||||
|
stats.ComponentStatInt(s.name, "sent_metrics", s.statsSentMetrics)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -247,7 +251,7 @@ func NewLibgangliaSink(name string, config json.RawMessage) (Sink, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("error opening %s: %v", s.config.GangliaLib, err)
|
return nil, fmt.Errorf("error opening %s: %v", s.config.GangliaLib, err)
|
||||||
}
|
}
|
||||||
|
s.statsSentMetrics = 0
|
||||||
// Set up cache for the C strings
|
// Set up cache for the C strings
|
||||||
s.cstrCache = make(map[string]*C.char)
|
s.cstrCache = make(map[string]*C.char)
|
||||||
// s.cstrCache["globals"] = C.CString("globals")
|
// s.cstrCache["globals"] = C.CString("globals")
|
||||||
|
@@ -11,6 +11,7 @@ import (
|
|||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
"github.com/gorilla/mux"
|
"github.com/gorilla/mux"
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||||
@@ -29,11 +30,12 @@ type PrometheusSinkConfig struct {
|
|||||||
|
|
||||||
type PrometheusSink struct {
|
type PrometheusSink struct {
|
||||||
sink
|
sink
|
||||||
config PrometheusSinkConfig
|
config PrometheusSinkConfig
|
||||||
labelMetrics map[string]*prometheus.GaugeVec
|
labelMetrics map[string]*prometheus.GaugeVec
|
||||||
nodeMetrics map[string]prometheus.Gauge
|
nodeMetrics map[string]prometheus.Gauge
|
||||||
promWg sync.WaitGroup
|
promWg sync.WaitGroup
|
||||||
promServer *http.Server
|
promServer *http.Server
|
||||||
|
statsSentMetrics int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func intToFloat64(input interface{}) (float64, error) {
|
func intToFloat64(input interface{}) (float64, error) {
|
||||||
@@ -113,6 +115,8 @@ func (s *PrometheusSink) newMetric(metric lp.CCMetric) error {
|
|||||||
s.nodeMetrics[name] = new
|
s.nodeMetrics[name] = new
|
||||||
prometheus.Register(new)
|
prometheus.Register(new)
|
||||||
}
|
}
|
||||||
|
s.statsSentMetrics++
|
||||||
|
stats.ComponentStatInt(s.name, "sent_metrics", s.statsSentMetrics)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -146,6 +150,8 @@ func (s *PrometheusSink) updateMetric(metric lp.CCMetric) error {
|
|||||||
}
|
}
|
||||||
s.nodeMetrics[name].Set(value)
|
s.nodeMetrics[name].Set(value)
|
||||||
}
|
}
|
||||||
|
s.statsSentMetrics++
|
||||||
|
stats.ComponentStatInt(s.name, "sent_metrics", s.statsSentMetrics)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -7,6 +7,7 @@ import (
|
|||||||
|
|
||||||
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
)
|
)
|
||||||
|
|
||||||
type SampleSinkConfig struct {
|
type SampleSinkConfig struct {
|
||||||
@@ -14,12 +15,15 @@ type SampleSinkConfig struct {
|
|||||||
// See: metricSink.go
|
// See: metricSink.go
|
||||||
defaultSinkConfig
|
defaultSinkConfig
|
||||||
// Additional config options, for SampleSink
|
// Additional config options, for SampleSink
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type SampleSink struct {
|
type SampleSink struct {
|
||||||
// declares elements 'name' and 'meta_as_tags' (string to bool map!)
|
// declares elements 'name' and 'meta_as_tags' (string to bool map!)
|
||||||
sink
|
sink
|
||||||
config SampleSinkConfig // entry point to the SampleSinkConfig
|
config SampleSinkConfig // entry point to the SampleSinkConfig
|
||||||
|
// Stats counters
|
||||||
|
statsSentMetrics int64
|
||||||
}
|
}
|
||||||
|
|
||||||
// Implement functions required for Sink interface
|
// Implement functions required for Sink interface
|
||||||
@@ -30,6 +34,8 @@ type SampleSink struct {
|
|||||||
func (s *SampleSink) Write(point lp.CCMetric) error {
|
func (s *SampleSink) Write(point lp.CCMetric) error {
|
||||||
// based on s.meta_as_tags use meta infos as tags
|
// based on s.meta_as_tags use meta infos as tags
|
||||||
log.Print(point)
|
log.Print(point)
|
||||||
|
s.statsSentMetrics++
|
||||||
|
stats.ComponentStatInt(s.name, "sent_metrics", s.statsSentMetrics)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -63,6 +69,9 @@ func NewSampleSink(name string, config json.RawMessage) (Sink, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Initialize stats counters
|
||||||
|
s.statsSentMetrics = 0
|
||||||
|
|
||||||
// Create lookup map to use meta infos as tags in the output metric
|
// Create lookup map to use meta infos as tags in the output metric
|
||||||
s.meta_as_tags = make(map[string]bool)
|
s.meta_as_tags = make(map[string]bool)
|
||||||
for _, k := range s.config.MetaAsTags {
|
for _, k := range s.config.MetaAsTags {
|
||||||
|
@@ -102,13 +102,19 @@ func (sm *sinkManager) Start() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
toTheSinks := func(p lp.CCMetric) {
|
toTheSinks := func(p lp.CCMetric) {
|
||||||
|
var wg sync.WaitGroup
|
||||||
// Send received metric to all outputs
|
// Send received metric to all outputs
|
||||||
cclog.ComponentDebug("SinkManager", "WRITE", p)
|
cclog.ComponentDebug("SinkManager", "WRITE", p)
|
||||||
for _, s := range sm.sinks {
|
for _, s := range sm.sinks {
|
||||||
if err := s.Write(p); err != nil {
|
wg.Add(1)
|
||||||
cclog.ComponentError("SinkManager", "WRITE", s.Name(), "write failed:", err.Error())
|
go func(s Sink) {
|
||||||
}
|
if err := s.Write(p); err != nil {
|
||||||
|
cclog.ComponentError("SinkManager", "WRITE", s.Name(), "write failed:", err.Error())
|
||||||
|
}
|
||||||
|
wg.Done()
|
||||||
|
}(s)
|
||||||
}
|
}
|
||||||
|
wg.Wait()
|
||||||
}
|
}
|
||||||
|
|
||||||
for {
|
for {
|
||||||
|
@@ -8,6 +8,7 @@ import (
|
|||||||
|
|
||||||
// "time"
|
// "time"
|
||||||
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
||||||
|
stats "github.com/ClusterCockpit/cc-metric-collector/internal/metricRouter"
|
||||||
)
|
)
|
||||||
|
|
||||||
type StdoutSink struct {
|
type StdoutSink struct {
|
||||||
@@ -17,6 +18,7 @@ type StdoutSink struct {
|
|||||||
defaultSinkConfig
|
defaultSinkConfig
|
||||||
Output string `json:"output_file,omitempty"`
|
Output string `json:"output_file,omitempty"`
|
||||||
}
|
}
|
||||||
|
sentMetrics int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *StdoutSink) Write(m lp.CCMetric) error {
|
func (s *StdoutSink) Write(m lp.CCMetric) error {
|
||||||
@@ -24,6 +26,8 @@ func (s *StdoutSink) Write(m lp.CCMetric) error {
|
|||||||
s.output,
|
s.output,
|
||||||
m.ToLineProtocol(s.meta_as_tags),
|
m.ToLineProtocol(s.meta_as_tags),
|
||||||
)
|
)
|
||||||
|
s.sentMetrics++
|
||||||
|
stats.ComponentStatInt(s.name, "sent_metrics", s.sentMetrics)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -68,6 +72,7 @@ func NewStdoutSink(name string, config json.RawMessage) (Sink, error) {
|
|||||||
for _, k := range s.config.MetaAsTags {
|
for _, k := range s.config.MetaAsTags {
|
||||||
s.meta_as_tags[k] = true
|
s.meta_as_tags[k] = true
|
||||||
}
|
}
|
||||||
|
s.sentMetrics = 0
|
||||||
|
|
||||||
return s, nil
|
return s, nil
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user