2021-03-25 14:46:25 +01:00
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
|
|
|
"encoding/json"
|
2021-05-14 19:22:42 +02:00
|
|
|
"flag"
|
2021-03-25 14:46:25 +01:00
|
|
|
"fmt"
|
2021-03-26 13:08:44 +01:00
|
|
|
"github.com/ClusterCockpit/cc-metric-collector/collectors"
|
2021-05-18 15:16:10 +02:00
|
|
|
"github.com/ClusterCockpit/cc-metric-collector/receivers"
|
2021-03-26 16:48:09 +01:00
|
|
|
"github.com/ClusterCockpit/cc-metric-collector/sinks"
|
2021-03-25 14:46:25 +01:00
|
|
|
"log"
|
|
|
|
"os"
|
|
|
|
"os/signal"
|
2021-03-26 13:08:44 +01:00
|
|
|
"strings"
|
2021-03-25 14:46:25 +01:00
|
|
|
"sync"
|
|
|
|
"time"
|
|
|
|
)
|
|
|
|
|
2021-03-26 13:08:44 +01:00
|
|
|
// List of provided collectors. Which collector should be run can be
|
|
|
|
// configured at 'collectors' list in 'config.json'.
|
2021-03-25 14:46:25 +01:00
|
|
|
var Collectors = map[string]collectors.MetricGetter{
|
2021-03-25 17:47:08 +01:00
|
|
|
"likwid": &collectors.LikwidCollector{},
|
|
|
|
"loadavg": &collectors.LoadavgCollector{},
|
|
|
|
"memstat": &collectors.MemstatCollector{},
|
|
|
|
"netstat": &collectors.NetstatCollector{},
|
|
|
|
"ibstat": &collectors.InfinibandCollector{},
|
|
|
|
"lustrestat": &collectors.LustreCollector{},
|
2021-05-14 19:22:42 +02:00
|
|
|
"cpustat": &collectors.CpustatCollector{},
|
|
|
|
"topprocs": &collectors.TopProcsCollector{},
|
|
|
|
"nvidia": &collectors.NvidiaCollector{},
|
2021-03-25 14:46:25 +01:00
|
|
|
}
|
2021-03-25 17:47:08 +01:00
|
|
|
|
2021-03-26 16:48:09 +01:00
|
|
|
var Sinks = map[string]sinks.SinkFuncs{
|
|
|
|
"influxdb": &sinks.InfluxSink{},
|
2021-05-14 19:22:42 +02:00
|
|
|
"stdout": &sinks.StdoutSink{},
|
|
|
|
"nats": &sinks.NatsSink{},
|
2021-03-26 16:48:09 +01:00
|
|
|
}
|
2021-03-26 10:19:54 +01:00
|
|
|
|
2021-05-18 15:16:10 +02:00
|
|
|
var Receivers = map[string]receivers.ReceiverFuncs{
|
|
|
|
"nats": &receivers.NatsReceiver{},
|
|
|
|
}
|
|
|
|
|
2021-03-26 13:08:44 +01:00
|
|
|
// Structure of the configuration file
|
2021-03-25 14:46:25 +01:00
|
|
|
type GlobalConfig struct {
|
2021-05-18 15:16:10 +02:00
|
|
|
Sink sinks.SinkConfig `json:"sink"`
|
|
|
|
Interval int `json:"interval"`
|
|
|
|
Duration int `json:"duration"`
|
|
|
|
Collectors []string `json:"collectors"`
|
|
|
|
Receiver receivers.ReceiverConfig `json:"receiver"`
|
2021-05-19 01:34:30 +02:00
|
|
|
DefTags map[string]string `json:"default_tags"`
|
2021-03-25 14:46:25 +01:00
|
|
|
}
|
|
|
|
|
2021-03-26 13:08:44 +01:00
|
|
|
// Load JSON configuration file
|
2021-03-25 14:46:25 +01:00
|
|
|
func LoadConfiguration(file string, config *GlobalConfig) error {
|
|
|
|
configFile, err := os.Open(file)
|
|
|
|
defer configFile.Close()
|
|
|
|
if err != nil {
|
|
|
|
fmt.Println(err.Error())
|
2021-05-11 12:41:29 +02:00
|
|
|
return err
|
2021-03-25 14:46:25 +01:00
|
|
|
}
|
|
|
|
jsonParser := json.NewDecoder(configFile)
|
|
|
|
jsonParser.Decode(config)
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2021-05-11 13:16:35 +02:00
|
|
|
// ReadCli registers the command line flags of the program, parses the command
// line and returns the resulting values in a map with the keys "configfile",
// "logfile" and "pidfile".
func ReadCli() map[string]string {
	configPath := flag.String("config", "./config.json", "Path to configuration file")
	logPath := flag.String("log", "stderr", "Path for logfile")
	pidPath := flag.String("pidfile", "/var/run/cc-metric-collector.pid", "Path for PID file")
	flag.Parse()

	return map[string]string{
		"configfile": *configPath,
		"logfile":    *logPath,
		"pidfile":    *pidPath,
	}
}
|
|
|
|
|
|
|
|
// SetLogging directs the output of the standard logger to logfile.
//
// The special value "stderr" selects os.Stderr. Any other value is treated as
// a file path which is opened for appending and created with mode 0600 if it
// does not exist yet. The file is deliberately kept open because the logger
// keeps writing to it.
//
// If the file cannot be opened, the error is logged to the current log output
// and returned; the logger's output is left unchanged in that case. The
// process is not terminated here, so the caller decides how to react.
func SetLogging(logfile string) error {
	if logfile == "stderr" {
		log.SetOutput(os.Stderr)
		return nil
	}

	file, err := os.OpenFile(logfile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0600)
	if err != nil {
		log.Print(err)
		return err
	}
	log.SetOutput(file)
	return nil
}
|
|
|
|
|
2021-05-29 03:40:12 +02:00
|
|
|
// CreatePidfile writes the process ID of the running program, as a decimal
// string, to the file at path pidfile.
//
// The file is created with mode 0600 if it does not exist. An existing file
// is truncated first, so that leftovers of a longer, stale PID cannot remain
// behind the newly written one.
//
// Any error from opening, writing or closing the file is logged and returned.
func CreatePidfile(pidfile string) error {
	file, err := os.OpenFile(pidfile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0600)
	if err != nil {
		log.Print(err)
		return err
	}

	_, err = file.Write([]byte(fmt.Sprintf("%d", os.Getpid())))
	// Always close the file, but report the write error first if both fail.
	if closeErr := file.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		log.Print(err)
		return err
	}
	return nil
}
|
|
|
|
|
|
|
|
// RemovePidfile deletes the file at path pidfile.
//
// A missing file is not an error, and a directory at that path is left
// untouched; nil is returned in both cases. Any other failure to inspect or
// remove the file is returned to the caller.
func RemovePidfile(pidfile string) error {
	info, err := os.Stat(pidfile)
	if err != nil {
		if os.IsNotExist(err) {
			return nil
		}
		// info is nil for every failed Stat, so it must not be consulted.
		return err
	}
	if info.IsDir() {
		return nil
	}
	return os.Remove(pidfile)
}
|
|
|
|
|
2021-03-26 13:08:44 +01:00
|
|
|
// Register an interrupt handler for Ctrl+C and similar. At signal,
|
|
|
|
// all collectors are closed
|
2021-05-29 03:40:12 +02:00
|
|
|
func shutdown(wg *sync.WaitGroup, config *GlobalConfig, sink sinks.SinkFuncs, recv receivers.ReceiverFuncs, pidfile string) {
|
2021-03-25 14:46:25 +01:00
|
|
|
sigs := make(chan os.Signal, 1)
|
|
|
|
signal.Notify(sigs, os.Interrupt)
|
|
|
|
|
|
|
|
go func(wg *sync.WaitGroup) {
|
|
|
|
<-sigs
|
|
|
|
log.Print("Shutdown...")
|
|
|
|
for _, c := range config.Collectors {
|
|
|
|
col := Collectors[c]
|
|
|
|
log.Print("Stop ", col.Name())
|
|
|
|
col.Close()
|
|
|
|
}
|
|
|
|
time.Sleep(1 * time.Second)
|
2021-05-18 15:16:10 +02:00
|
|
|
if recv != nil {
|
|
|
|
recv.Close()
|
|
|
|
}
|
2021-03-26 16:48:09 +01:00
|
|
|
sink.Close()
|
2021-05-29 03:40:12 +02:00
|
|
|
RemovePidfile(pidfile)
|
2021-03-25 14:46:25 +01:00
|
|
|
wg.Done()
|
|
|
|
}(wg)
|
|
|
|
}
|
|
|
|
|
|
|
|
func main() {
|
|
|
|
var config GlobalConfig
|
|
|
|
var wg sync.WaitGroup
|
2021-05-18 15:16:10 +02:00
|
|
|
var recv receivers.ReceiverFuncs = nil
|
|
|
|
var use_recv bool
|
|
|
|
use_recv = false
|
2021-03-25 14:46:25 +01:00
|
|
|
wg.Add(1)
|
|
|
|
host, err := os.Hostname()
|
2021-03-25 17:47:08 +01:00
|
|
|
if err != nil {
|
|
|
|
log.Print(err)
|
|
|
|
return
|
|
|
|
}
|
2021-05-11 13:16:35 +02:00
|
|
|
clicfg := ReadCli()
|
2021-05-29 03:40:12 +02:00
|
|
|
err = CreatePidfile(clicfg["pidfile"])
|
2021-05-11 13:16:35 +02:00
|
|
|
err = SetLogging(clicfg["logfile"])
|
2021-05-14 19:22:42 +02:00
|
|
|
if err != nil {
|
|
|
|
log.Print("Error setting up logging system to ", clicfg["logfile"])
|
|
|
|
return
|
2021-05-11 13:16:35 +02:00
|
|
|
}
|
2021-03-25 14:46:25 +01:00
|
|
|
|
2021-03-26 13:08:44 +01:00
|
|
|
// Load and check configuration
|
2021-05-11 13:16:35 +02:00
|
|
|
err = LoadConfiguration(clicfg["configfile"], &config)
|
2021-05-14 19:22:42 +02:00
|
|
|
if err != nil {
|
|
|
|
log.Print("Error reading configuration file ", clicfg["configfile"])
|
|
|
|
return
|
2021-05-11 12:41:29 +02:00
|
|
|
}
|
2021-03-25 17:47:08 +01:00
|
|
|
if config.Interval <= 0 || time.Duration(config.Interval)*time.Second <= 0 {
|
|
|
|
log.Print("Configuration value 'interval' must be greater than zero")
|
|
|
|
return
|
2021-03-25 14:46:25 +01:00
|
|
|
}
|
|
|
|
if config.Duration <= 0 {
|
2021-03-25 17:47:08 +01:00
|
|
|
log.Print("Configuration value 'duration' must be greater than zero")
|
|
|
|
return
|
2021-03-25 14:46:25 +01:00
|
|
|
}
|
2021-03-26 13:08:44 +01:00
|
|
|
if len(config.Collectors) == 0 {
|
|
|
|
var keys []string
|
|
|
|
for k := range Collectors {
|
|
|
|
keys = append(keys, k)
|
|
|
|
}
|
|
|
|
log.Print("Configuration value 'collectors' does not contain any collector. Available: ", strings.Join(keys, ", "))
|
|
|
|
return
|
|
|
|
}
|
|
|
|
for _, name := range config.Collectors {
|
|
|
|
if _, found := Collectors[name]; !found {
|
|
|
|
log.Print("Invalid collector '", name, "' in configuration")
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
2021-03-26 16:48:09 +01:00
|
|
|
if _, found := Sinks[config.Sink.Type]; !found {
|
|
|
|
log.Print("Invalid sink type '", config.Sink.Type, "' in configuration")
|
|
|
|
return
|
|
|
|
}
|
|
|
|
// Setup sink
|
|
|
|
sink := Sinks[config.Sink.Type]
|
2021-05-18 15:16:10 +02:00
|
|
|
err = sink.Init(config.Sink)
|
2021-03-26 16:48:09 +01:00
|
|
|
if err != nil {
|
2021-05-18 15:16:10 +02:00
|
|
|
log.Print(err)
|
2021-03-26 16:48:09 +01:00
|
|
|
return
|
|
|
|
}
|
2021-05-18 15:16:10 +02:00
|
|
|
// Setup receiver
|
2021-05-18 15:44:32 +02:00
|
|
|
if len(config.Receiver.Type) > 0 && config.Receiver.Type != "none" {
|
2021-05-18 15:16:10 +02:00
|
|
|
if _, found := Receivers[config.Receiver.Type]; !found {
|
|
|
|
log.Print("Invalid receiver type '", config.Receiver.Type, "' in configuration")
|
|
|
|
return
|
|
|
|
} else {
|
|
|
|
recv = Receivers[config.Receiver.Type]
|
|
|
|
err = recv.Init(config.Receiver, sink)
|
|
|
|
if err == nil {
|
|
|
|
use_recv = true
|
|
|
|
} else {
|
|
|
|
log.Print(err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2021-03-26 10:19:54 +01:00
|
|
|
|
2021-03-26 16:48:09 +01:00
|
|
|
// Register interrupt handler
|
2021-05-29 03:40:12 +02:00
|
|
|
shutdown(&wg, &config, sink, recv, clicfg["pidfile"])
|
2021-03-26 10:19:54 +01:00
|
|
|
|
2021-03-26 13:08:44 +01:00
|
|
|
// Initialize all collectors
|
2021-05-14 19:22:42 +02:00
|
|
|
tmp := make([]string, 0)
|
2021-03-25 14:46:25 +01:00
|
|
|
for _, c := range config.Collectors {
|
|
|
|
col := Collectors[c]
|
2021-05-14 19:22:42 +02:00
|
|
|
err = col.Init()
|
|
|
|
if err != nil {
|
|
|
|
log.Print("SKIP ", col.Name())
|
|
|
|
} else {
|
|
|
|
log.Print("Start ", col.Name())
|
|
|
|
tmp = append(tmp, c)
|
|
|
|
}
|
2021-03-25 14:46:25 +01:00
|
|
|
}
|
2021-05-14 19:22:42 +02:00
|
|
|
config.Collectors = tmp
|
2021-03-26 10:19:54 +01:00
|
|
|
|
2021-03-26 13:08:44 +01:00
|
|
|
// Setup up ticker loop
|
|
|
|
log.Print("Running loop every ", time.Duration(config.Interval)*time.Second)
|
2021-03-25 14:46:25 +01:00
|
|
|
ticker := time.NewTicker(time.Duration(config.Interval) * time.Second)
|
|
|
|
done := make(chan bool)
|
2021-03-26 10:19:54 +01:00
|
|
|
|
2021-03-26 13:08:44 +01:00
|
|
|
// Storage for all node metrics
|
2021-03-26 10:19:54 +01:00
|
|
|
nodeFields := make(map[string]interface{})
|
|
|
|
|
2021-03-26 13:08:44 +01:00
|
|
|
// Storage for all socket metrics
|
2021-03-25 14:46:25 +01:00
|
|
|
slist := collectors.SocketList()
|
2021-03-26 10:19:54 +01:00
|
|
|
socketsFields := make(map[int]map[string]interface{}, len(slist))
|
2021-03-25 17:47:08 +01:00
|
|
|
for _, s := range slist {
|
2021-03-26 10:19:54 +01:00
|
|
|
socketsFields[s] = make(map[string]interface{})
|
2021-03-25 17:47:08 +01:00
|
|
|
}
|
2021-03-26 10:19:54 +01:00
|
|
|
|
2021-03-26 13:08:44 +01:00
|
|
|
// Storage for all CPU metrics
|
2021-03-25 17:47:08 +01:00
|
|
|
clist := collectors.CpuList()
|
2021-03-26 10:19:54 +01:00
|
|
|
cpuFields := make(map[int]map[string]interface{}, len(clist))
|
2021-03-25 17:47:08 +01:00
|
|
|
for _, s := range clist {
|
2021-03-26 10:19:54 +01:00
|
|
|
cpuFields[s] = make(map[string]interface{})
|
2021-03-25 17:47:08 +01:00
|
|
|
}
|
2021-03-25 14:46:25 +01:00
|
|
|
|
2021-05-18 15:16:10 +02:00
|
|
|
// Start receiver
|
|
|
|
if use_recv {
|
|
|
|
recv.Start()
|
|
|
|
}
|
|
|
|
|
2021-03-25 14:46:25 +01:00
|
|
|
go func() {
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-done:
|
|
|
|
return
|
2021-03-25 17:47:08 +01:00
|
|
|
case t := <-ticker.C:
|
2021-03-26 13:08:44 +01:00
|
|
|
// Count how many socket and cpu metrics are returned
|
2021-03-25 17:47:08 +01:00
|
|
|
scount := 0
|
|
|
|
ccount := 0
|
2021-03-26 10:19:54 +01:00
|
|
|
|
2021-03-26 13:08:44 +01:00
|
|
|
// Read all collectors are sort the results in the right
|
|
|
|
// storage locations
|
2021-03-25 17:47:08 +01:00
|
|
|
for _, c := range config.Collectors {
|
|
|
|
col := Collectors[c]
|
|
|
|
col.Read(time.Duration(config.Duration))
|
2021-03-26 10:19:54 +01:00
|
|
|
|
2021-03-25 17:47:08 +01:00
|
|
|
for key, val := range col.GetNodeMetric() {
|
2021-03-26 10:19:54 +01:00
|
|
|
nodeFields[key] = val
|
2021-03-25 17:47:08 +01:00
|
|
|
}
|
|
|
|
for sid, socket := range col.GetSocketMetrics() {
|
|
|
|
for key, val := range socket {
|
2021-03-26 10:19:54 +01:00
|
|
|
socketsFields[sid][key] = val
|
2021-03-25 17:47:08 +01:00
|
|
|
scount++
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for cid, cpu := range col.GetCpuMetrics() {
|
|
|
|
for key, val := range cpu {
|
2021-03-26 10:19:54 +01:00
|
|
|
cpuFields[cid][key] = val
|
2021-03-25 17:47:08 +01:00
|
|
|
ccount++
|
|
|
|
}
|
|
|
|
}
|
2021-03-25 14:46:25 +01:00
|
|
|
}
|
2021-03-26 16:48:09 +01:00
|
|
|
|
2021-03-26 13:08:44 +01:00
|
|
|
// Send out node metrics
|
2021-05-14 19:22:42 +02:00
|
|
|
if len(nodeFields) > 0 {
|
2021-05-19 01:34:30 +02:00
|
|
|
nodeTags := map[string]string{"host": host}
|
2021-05-29 03:40:12 +02:00
|
|
|
for k, v := range config.DefTags {
|
|
|
|
nodeTags[k] = v
|
2021-05-19 01:34:30 +02:00
|
|
|
}
|
|
|
|
sink.Write("node", nodeTags, nodeFields, t)
|
2021-05-14 19:22:42 +02:00
|
|
|
}
|
2021-03-26 10:19:54 +01:00
|
|
|
|
2021-03-26 13:08:44 +01:00
|
|
|
// Send out socket metrics (if any)
|
2021-03-25 17:47:08 +01:00
|
|
|
if scount > 0 {
|
2021-03-26 10:19:54 +01:00
|
|
|
for sid, socket := range socketsFields {
|
2021-05-14 19:22:42 +02:00
|
|
|
if len(socket) > 0 {
|
2021-05-19 01:34:30 +02:00
|
|
|
socketTags := map[string]string{"socket": fmt.Sprintf("%d", sid), "host": host}
|
2021-05-29 03:40:12 +02:00
|
|
|
for k, v := range config.DefTags {
|
|
|
|
socketTags[k] = v
|
2021-05-19 01:34:30 +02:00
|
|
|
}
|
|
|
|
sink.Write("socket", socketTags, socket, t)
|
2021-05-14 19:22:42 +02:00
|
|
|
}
|
2021-03-25 17:47:08 +01:00
|
|
|
}
|
|
|
|
}
|
2021-03-26 10:19:54 +01:00
|
|
|
|
2021-03-26 13:08:44 +01:00
|
|
|
// Send out CPU metrics (if any)
|
2021-03-25 17:47:08 +01:00
|
|
|
if ccount > 0 {
|
2021-03-26 10:19:54 +01:00
|
|
|
for cid, cpu := range cpuFields {
|
2021-05-14 19:22:42 +02:00
|
|
|
if len(cpu) > 0 {
|
2021-05-19 01:34:30 +02:00
|
|
|
cpuTags := map[string]string{"cpu": fmt.Sprintf("%d", cid), "host": host}
|
2021-05-29 03:40:12 +02:00
|
|
|
for k, v := range config.DefTags {
|
|
|
|
cpuTags[k] = v
|
2021-05-19 01:34:30 +02:00
|
|
|
}
|
|
|
|
sink.Write("cpu", cpuTags, cpu, t)
|
2021-05-14 19:22:42 +02:00
|
|
|
}
|
2021-03-25 17:47:08 +01:00
|
|
|
}
|
|
|
|
}
|
2021-03-25 14:46:25 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}()
|
|
|
|
|
2021-03-26 13:08:44 +01:00
|
|
|
// Wait until receiving an interrupt
|
2021-03-25 14:46:25 +01:00
|
|
|
wg.Wait()
|
|
|
|
}
|