mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2024-12-26 15:29:04 +01:00
Collector-specific configuration. LIKWID collector derives metrics itself, Run once CLI option
This commit is contained in:
parent
57948e8cff
commit
51b8c62d4d
@ -7,24 +7,40 @@ import (
|
|||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
"encoding/json"
|
||||||
)
|
)
|
||||||
|
|
||||||
const CPUSTATFILE = `/proc/stat`
|
const CPUSTATFILE = `/proc/stat`
|
||||||
|
|
||||||
type CpustatCollector struct {
|
type CpustatCollectorConfig struct {
|
||||||
MetricCollector
|
ExcludeMetrics []string `json:"exclude_metrics, omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *CpustatCollector) Init() error {
|
type CpustatCollector struct {
|
||||||
|
MetricCollector
|
||||||
|
config CpustatCollectorConfig
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *CpustatCollector) Init(config []byte) error {
|
||||||
m.name = "CpustatCollector"
|
m.name = "CpustatCollector"
|
||||||
m.setup()
|
m.setup()
|
||||||
|
if len(config) > 0 {
|
||||||
|
err := json.Unmarshal(config, &m.config)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func ParseStatLine(line string, cpu int, out *[]lp.MutableMetric) {
|
|
||||||
|
func ParseStatLine(line string, cpu int, exclude []string, out *[]lp.MutableMetric) {
|
||||||
ls := strings.Fields(line)
|
ls := strings.Fields(line)
|
||||||
matches := []string{"", "cpu_user", "cpu_nice", "cpu_system", "cpu_idle", "cpu_iowait", "cpu_irq", "cpu_softirq", "cpu_steal", "cpu_guest", "cpu_guest_nice"}
|
matches := []string{"", "cpu_user", "cpu_nice", "cpu_system", "cpu_idle", "cpu_iowait", "cpu_irq", "cpu_softirq", "cpu_steal", "cpu_guest", "cpu_guest_nice"}
|
||||||
|
for _, ex := range exclude {
|
||||||
|
matches, _ = RemoveFromStringList(matches, ex)
|
||||||
|
}
|
||||||
|
|
||||||
var tags map[string]string
|
var tags map[string]string
|
||||||
if cpu < 0 {
|
if cpu < 0 {
|
||||||
@ -46,6 +62,9 @@ func ParseStatLine(line string, cpu int, out *[]lp.MutableMetric) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *CpustatCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
func (m *CpustatCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
||||||
|
if (!m.init) {
|
||||||
|
return
|
||||||
|
}
|
||||||
buffer, err := ioutil.ReadFile(string(CPUSTATFILE))
|
buffer, err := ioutil.ReadFile(string(CPUSTATFILE))
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -59,11 +78,11 @@ func (m *CpustatCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
|
|||||||
}
|
}
|
||||||
ls := strings.Fields(line)
|
ls := strings.Fields(line)
|
||||||
if strings.Compare(ls[0], "cpu") == 0 {
|
if strings.Compare(ls[0], "cpu") == 0 {
|
||||||
ParseStatLine(line, -1, out)
|
ParseStatLine(line, -1, m.config.ExcludeMetrics, out)
|
||||||
} else if strings.HasPrefix(ls[0], "cpu") {
|
} else if strings.HasPrefix(ls[0], "cpu") {
|
||||||
cpustr := strings.TrimLeft(ls[0], "cpu")
|
cpustr := strings.TrimLeft(ls[0], "cpu")
|
||||||
cpu, _ := strconv.Atoi(cpustr)
|
cpu, _ := strconv.Atoi(cpustr)
|
||||||
ParseStatLine(line, cpu, out)
|
ParseStatLine(line, cpu, m.config.ExcludeMetrics, out)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,25 +1,65 @@
|
|||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"errors"
|
||||||
lp "github.com/influxdata/line-protocol"
|
lp "github.com/influxdata/line-protocol"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"log"
|
"log"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"time"
|
"time"
|
||||||
|
"encoding/json"
|
||||||
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
const CUSTOMCMDPATH = `/home/unrz139/Work/cc-metric-collector/collectors/custom`
|
const CUSTOMCMDPATH = `/home/unrz139/Work/cc-metric-collector/collectors/custom`
|
||||||
|
|
||||||
|
type CustomCmdCollectorConfig struct {
|
||||||
|
commands []string `json:"commands"`
|
||||||
|
files []string `json:"files"`
|
||||||
|
ExcludeMetrics []string `json:"exclude_metrics"`
|
||||||
|
}
|
||||||
|
|
||||||
type CustomCmdCollector struct {
|
type CustomCmdCollector struct {
|
||||||
MetricCollector
|
MetricCollector
|
||||||
handler *lp.MetricHandler
|
handler *lp.MetricHandler
|
||||||
parser *lp.Parser
|
parser *lp.Parser
|
||||||
|
config CustomCmdCollectorConfig
|
||||||
|
commands []string
|
||||||
|
files []string
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *CustomCmdCollector) Init() error {
|
func (m *CustomCmdCollector) Init(config []byte) error {
|
||||||
|
var err error
|
||||||
m.name = "CustomCmdCollector"
|
m.name = "CustomCmdCollector"
|
||||||
|
if len(config) > 0 {
|
||||||
|
err = json.Unmarshal(config, &m.config)
|
||||||
|
if err != nil {
|
||||||
|
log.Print(err.Error())
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
m.setup()
|
m.setup()
|
||||||
|
for _, c := range m.config.commands {
|
||||||
|
cmdfields := strings.Fields(c)
|
||||||
|
command := exec.Command(cmdfields[0], strings.Join(cmdfields[1:], " "))
|
||||||
|
command.Wait()
|
||||||
|
_, err = command.Output()
|
||||||
|
if err != nil {
|
||||||
|
m.commands = append(m.commands, c)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, f := range m.config.files {
|
||||||
|
_, err = ioutil.ReadFile(f)
|
||||||
|
if err == nil {
|
||||||
|
m.files = append(m.files, f)
|
||||||
|
} else {
|
||||||
|
log.Print(err.Error())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(m.files) == 0 && len(m.commands) == 0 {
|
||||||
|
return errors.New("No metrics to collect")
|
||||||
|
}
|
||||||
m.handler = lp.NewMetricHandler()
|
m.handler = lp.NewMetricHandler()
|
||||||
m.parser = lp.NewParser(m.handler)
|
m.parser = lp.NewParser(m.handler)
|
||||||
m.parser.SetTimeFunc(DefaultTime)
|
m.parser.SetTimeFunc(DefaultTime)
|
||||||
@ -32,73 +72,55 @@ var DefaultTime = func() time.Time {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *CustomCmdCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
func (m *CustomCmdCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
||||||
files, err := ioutil.ReadDir(string(CUSTOMCMDPATH))
|
if !m.init {
|
||||||
if err != nil {
|
return
|
||||||
log.Print(err)
|
}
|
||||||
return
|
for _, cmd := range m.commands {
|
||||||
}
|
cmdfields := strings.Fields(cmd)
|
||||||
for _, file := range files {
|
command := exec.Command(cmdfields[0], strings.Join(cmdfields[1:], " "))
|
||||||
// stat, err := os.Stat(file)
|
|
||||||
// if err != nil {
|
|
||||||
// log.Print(err)
|
|
||||||
// continue
|
|
||||||
// }
|
|
||||||
// mode := stat.Mode()
|
|
||||||
// if mode & 0o555 {
|
|
||||||
path := fmt.Sprintf("%s/%s", string(CUSTOMCMDPATH), file.Name())
|
|
||||||
command := exec.Command(path, "")
|
|
||||||
command.Wait()
|
command.Wait()
|
||||||
stdout, err := command.Output()
|
stdout, err := command.Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Print(err)
|
log.Print(err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
metrics, err := m.parser.Parse(stdout)
|
cmdmetrics, err := m.parser.Parse(stdout)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Print(err)
|
log.Print(err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
for _, m := range metrics {
|
for _, c := range cmdmetrics {
|
||||||
y, err := lp.New(m.Name(), Tags2Map(m), Fields2Map(m), m.Time())
|
_, skip := stringArrayContains(m.config.ExcludeMetrics, c.Name())
|
||||||
if err == nil {
|
if skip {
|
||||||
*out = append(*out, y)
|
continue
|
||||||
}
|
}
|
||||||
// switch m.Name() {
|
y, err := lp.New(c.Name(), Tags2Map(c), Fields2Map(c), c.Time())
|
||||||
// case "node":
|
if err == nil {
|
||||||
// for k, v := range m.FieldList() {
|
*out = append(*out, y)
|
||||||
// m.node[k] = float64(v)
|
}
|
||||||
// }
|
}
|
||||||
// case "socket":
|
}
|
||||||
// tlist := m.TagList()
|
for _, file := range m.files {
|
||||||
// if id, found := tlist["socket"]; found {
|
buffer, err := ioutil.ReadFile(file)
|
||||||
// for k, v := range m.FieldList() {
|
if err != nil {
|
||||||
// m.socket[id][k] = float64(v)
|
log.Print(err)
|
||||||
// }
|
return
|
||||||
// }
|
}
|
||||||
// case "cpu":
|
fmetrics, err := m.parser.Parse(buffer)
|
||||||
// tlist := m.TagList()
|
if err != nil {
|
||||||
// if id, found := tlist["cpu"]; found {
|
log.Print(err)
|
||||||
// for k, v := range m.FieldList() {
|
continue
|
||||||
// m.cpu[id][k] = float64(v)
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// case "network":
|
|
||||||
// tlist := m.TagList()
|
|
||||||
// if id, found := tlist["device"]; found {
|
|
||||||
// for k, v := range m.FieldList() {
|
|
||||||
// m.network[id][k] = float64(v)
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// case "accelerator":
|
|
||||||
// tlist := m.TagList()
|
|
||||||
// if id, found := tlist["device"]; found {
|
|
||||||
// for k, v := range m.FieldList() {
|
|
||||||
// m.accelerator[id][k] = float64(v)
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
}
|
}
|
||||||
// } if file is executable check
|
for _, f := range fmetrics {
|
||||||
|
_, skip := stringArrayContains(m.config.ExcludeMetrics, f.Name())
|
||||||
|
if skip {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
y, err := lp.New(f.Name(), Tags2Map(f), Fields2Map(f), f.Time())
|
||||||
|
if err == nil {
|
||||||
|
*out = append(*out, y)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,28 +1,42 @@
|
|||||||
package collectors
|
package collectors
|
||||||
|
|
||||||
import (
|
import (
|
||||||
// "errors"
|
|
||||||
// "fmt"
|
|
||||||
lp "github.com/influxdata/line-protocol"
|
lp "github.com/influxdata/line-protocol"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"log"
|
// "log"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
)
|
)
|
||||||
|
|
||||||
const DISKSTATFILE = `/proc/diskstats`
|
const DISKSTATFILE = `/proc/diskstats`
|
||||||
|
const DISKSTAT_SYSFSPATH = `/sys/block`
|
||||||
|
|
||||||
|
type DiskstatCollectorConfig struct {
|
||||||
|
ExcludeMetrics []string `json:"exclude_metrics, omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
type DiskstatCollector struct {
|
type DiskstatCollector struct {
|
||||||
MetricCollector
|
MetricCollector
|
||||||
matches map[int]string
|
matches map[int]string
|
||||||
|
config DiskstatCollectorConfig
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *DiskstatCollector) Init() error {
|
|
||||||
|
func (m *DiskstatCollector) Init(config []byte) error {
|
||||||
|
var err error
|
||||||
m.name = "DiskstatCollector"
|
m.name = "DiskstatCollector"
|
||||||
m.setup()
|
m.setup()
|
||||||
|
if len(config) > 0 {
|
||||||
|
err = json.Unmarshal(config, &m.config)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
// https://www.kernel.org/doc/html/latest/admin-guide/iostats.html
|
// https://www.kernel.org/doc/html/latest/admin-guide/iostats.html
|
||||||
m.matches = map[int]string{
|
matches := map[int]string{
|
||||||
3: "reads",
|
3: "reads",
|
||||||
4: "reads_merged",
|
4: "reads_merged",
|
||||||
5: "read_sectors",
|
5: "read_sectors",
|
||||||
@ -41,46 +55,60 @@ func (m *DiskstatCollector) Init() error {
|
|||||||
18: "flushes",
|
18: "flushes",
|
||||||
19: "flushes_ms",
|
19: "flushes_ms",
|
||||||
}
|
}
|
||||||
_, err := ioutil.ReadFile(string(DISKSTATFILE))
|
m.matches = make(map[int]string)
|
||||||
if err == nil {
|
for k, v := range matches {
|
||||||
m.init = true
|
_, skip := stringArrayContains(m.config.ExcludeMetrics, v)
|
||||||
|
if (!skip) {
|
||||||
|
m.matches[k] = v
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
if len(m.matches) == 0 {
|
||||||
|
return errors.New("No metrics to collect")
|
||||||
|
}
|
||||||
|
_, err = ioutil.ReadFile(string(DISKSTATFILE))
|
||||||
|
if err == nil {
|
||||||
|
m.init = true
|
||||||
|
}
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
func (m *DiskstatCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
func (m *DiskstatCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
||||||
|
var lines []string
|
||||||
|
if !m.init {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
buffer, err := ioutil.ReadFile(string(DISKSTATFILE))
|
buffer, err := ioutil.ReadFile(string(DISKSTATFILE))
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
lines = strings.Split(string(buffer), "\n")
|
||||||
|
|
||||||
if err != nil {
|
for _, line := range lines {
|
||||||
log.Print(err)
|
if len(line) == 0 {
|
||||||
return
|
continue
|
||||||
}
|
}
|
||||||
|
f := strings.Fields(line)
|
||||||
ll := strings.Split(string(buffer), "\n")
|
if strings.Contains(f[2], "loop") {
|
||||||
|
continue
|
||||||
for _, line := range ll {
|
}
|
||||||
if len(line) == 0 {
|
tags := map[string]string{
|
||||||
continue
|
"device": f[2],
|
||||||
}
|
"type": "node",
|
||||||
f := strings.Fields(line)
|
}
|
||||||
if strings.Contains(f[2], "loop") {
|
for idx, name := range m.matches {
|
||||||
continue
|
if idx < len(f) {
|
||||||
}
|
x, err := strconv.ParseInt(f[idx], 0, 64)
|
||||||
tags := map[string]string{
|
if err == nil {
|
||||||
"device": f[2],
|
y, err := lp.New(name, tags, map[string]interface{}{"value": int(x)}, time.Now())
|
||||||
"type": "node",
|
if err == nil {
|
||||||
}
|
*out = append(*out, y)
|
||||||
for idx, name := range m.matches {
|
}
|
||||||
x, err := strconv.ParseInt(f[idx], 0, 64)
|
}
|
||||||
if err == nil {
|
}
|
||||||
y, err := lp.New(name, tags, map[string]interface{}{"value": int(x)}, time.Now())
|
}
|
||||||
if err == nil {
|
}
|
||||||
*out = append(*out, y)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6,59 +6,104 @@ import (
|
|||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"log"
|
"log"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
|
// "os"
|
||||||
|
"path/filepath"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const BASEPATH = `/sys/class/infiniband/`
|
||||||
const LIDFILE = `/sys/class/infiniband/mlx4_0/ports/1/lid`
|
const LIDFILE = `/sys/class/infiniband/mlx4_0/ports/1/lid`
|
||||||
const PERFQUERY = `/usr/sbin/perfquery`
|
const PERFQUERY = `/usr/sbin/perfquery`
|
||||||
|
|
||||||
|
type InfinibandCollectorConfig struct {
|
||||||
|
ExcludeDevices []string `json:"exclude_devices, omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
type InfinibandCollector struct {
|
type InfinibandCollector struct {
|
||||||
MetricCollector
|
MetricCollector
|
||||||
tags map[string]string
|
tags map[string]string
|
||||||
|
lids map[string]map[string]string
|
||||||
|
config NetstatCollectorConfig
|
||||||
|
use_perfquery bool
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *InfinibandCollector) Init() error {
|
func (m *InfinibandCollector) Init(config []byte) error {
|
||||||
|
var err error
|
||||||
m.name = "InfinibandCollector"
|
m.name = "InfinibandCollector"
|
||||||
|
m.use_perfquery = false
|
||||||
m.setup()
|
m.setup()
|
||||||
m.tags = map[string]string{"type": "node"}
|
m.tags = map[string]string{"type": "node"}
|
||||||
_, err := ioutil.ReadFile(string(LIDFILE))
|
if len(config) > 0 {
|
||||||
if err == nil {
|
err = json.Unmarshal(config, &m.config)
|
||||||
_, err = ioutil.ReadFile(string(PERFQUERY))
|
if err != nil {
|
||||||
if err == nil {
|
return err
|
||||||
m.init = true
|
}
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
m.lids = make(map[string]map[string]string)
|
||||||
|
p := fmt.Sprintf("%s/*/ports/*/lid", string(BASEPATH))
|
||||||
|
files, err := filepath.Glob(p)
|
||||||
|
for _, f := range(files) {
|
||||||
|
lid, err := ioutil.ReadFile(f)
|
||||||
|
if err == nil {
|
||||||
|
plist := strings.Split(strings.Replace(f, string(BASEPATH), "", -1), "/")
|
||||||
|
skip := false
|
||||||
|
for _, d := range m.config.ExcludeDevices {
|
||||||
|
if d == plist[0] {
|
||||||
|
skip = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !skip {
|
||||||
|
m.lids[plist[0]] = make(map[string]string)
|
||||||
|
m.lids[plist[0]][plist[2]] = string(lid)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, ports := range m.lids {
|
||||||
|
for port, lid := range ports {
|
||||||
|
args := fmt.Sprintf("-r %s %s 0xf000", lid, port)
|
||||||
|
command := exec.Command(PERFQUERY, args)
|
||||||
|
command.Wait()
|
||||||
|
_, err := command.Output()
|
||||||
|
if (err == nil) {
|
||||||
|
m.use_perfquery = true
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(m.lids) > 0 {
|
||||||
|
m.init = true
|
||||||
|
} else {
|
||||||
|
err = errors.New("No usable devices")
|
||||||
|
}
|
||||||
|
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *InfinibandCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
func DoPerfQuery(dev string, lid string, port string, tags map[string]string, out *[]lp.MutableMetric) error {
|
||||||
buffer, err := ioutil.ReadFile(string(LIDFILE))
|
|
||||||
|
|
||||||
if err != nil {
|
args := fmt.Sprintf("-r %s %s 0xf000", lid, port)
|
||||||
log.Print(err)
|
command := exec.Command(PERFQUERY, args)
|
||||||
return
|
command.Wait()
|
||||||
}
|
stdout, err := command.Output()
|
||||||
|
if err != nil {
|
||||||
args := fmt.Sprintf("-r %s 1 0xf000", string(buffer))
|
log.Print(err)
|
||||||
|
return err
|
||||||
command := exec.Command(PERFQUERY, args)
|
}
|
||||||
command.Wait()
|
ll := strings.Split(string(stdout), "\n")
|
||||||
stdout, err := command.Output()
|
|
||||||
if err != nil {
|
|
||||||
log.Print(err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
ll := strings.Split(string(stdout), "\n")
|
|
||||||
|
|
||||||
for _, line := range ll {
|
for _, line := range ll {
|
||||||
if strings.HasPrefix(line, "PortRcvData") || strings.HasPrefix(line, "RcvData") {
|
if strings.HasPrefix(line, "PortRcvData") || strings.HasPrefix(line, "RcvData") {
|
||||||
lv := strings.Fields(line)
|
lv := strings.Fields(line)
|
||||||
v, err := strconv.ParseFloat(lv[1], 64)
|
v, err := strconv.ParseFloat(lv[1], 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y, err := lp.New("ib_recv", m.tags, map[string]interface{}{"value": float64(v)}, time.Now())
|
y, err := lp.New("ib_recv", tags, map[string]interface{}{"value": float64(v)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
@ -68,13 +113,100 @@ func (m *InfinibandCollector) Read(interval time.Duration, out *[]lp.MutableMetr
|
|||||||
lv := strings.Fields(line)
|
lv := strings.Fields(line)
|
||||||
v, err := strconv.ParseFloat(lv[1], 64)
|
v, err := strconv.ParseFloat(lv[1], 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y, err := lp.New("ib_xmit", m.tags, map[string]interface{}{"value": float64(v)}, time.Now())
|
y, err := lp.New("ib_xmit", tags, map[string]interface{}{"value": float64(v)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func DoSysfsRead(dev string, lid string, port string, tags map[string]string, out *[]lp.MutableMetric) error {
|
||||||
|
path := fmt.Sprintf("%s/%s/ports/%s/counters/", string(BASEPATH), dev, port)
|
||||||
|
buffer, err := ioutil.ReadFile(fmt.Sprintf("%s/port_rcv_data", path))
|
||||||
|
if err == nil {
|
||||||
|
data := strings.Replace(string(buffer), "\n", "", -1)
|
||||||
|
v, err := strconv.ParseFloat(data, 64)
|
||||||
|
if err == nil {
|
||||||
|
y, err := lp.New("ib_recv", tags, map[string]interface{}{"value": float64(v)}, time.Now())
|
||||||
|
if err == nil {
|
||||||
|
*out = append(*out, y)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
buffer, err = ioutil.ReadFile(fmt.Sprintf("%s/port_xmit_data", path))
|
||||||
|
if err == nil {
|
||||||
|
data := strings.Replace(string(buffer), "\n", "", -1)
|
||||||
|
v, err := strconv.ParseFloat(data, 64)
|
||||||
|
if err == nil {
|
||||||
|
y, err := lp.New("ib_xmit", tags, map[string]interface{}{"value": float64(v)}, time.Now())
|
||||||
|
if err == nil {
|
||||||
|
*out = append(*out, y)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *InfinibandCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
||||||
|
|
||||||
|
if m.init {
|
||||||
|
for dev, ports := range m.lids {
|
||||||
|
for port, lid := range ports {
|
||||||
|
tags := map[string]string{"type" : "node", "device" : dev, "port" : port}
|
||||||
|
if m.use_perfquery {
|
||||||
|
DoPerfQuery(dev, lid, port, tags, out)
|
||||||
|
} else {
|
||||||
|
DoSysfsRead(dev, lid, port, tags, out)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// buffer, err := ioutil.ReadFile(string(LIDFILE))
|
||||||
|
|
||||||
|
// if err != nil {
|
||||||
|
// log.Print(err)
|
||||||
|
// return
|
||||||
|
// }
|
||||||
|
|
||||||
|
// args := fmt.Sprintf("-r %s 1 0xf000", string(buffer))
|
||||||
|
|
||||||
|
// command := exec.Command(PERFQUERY, args)
|
||||||
|
// command.Wait()
|
||||||
|
// stdout, err := command.Output()
|
||||||
|
// if err != nil {
|
||||||
|
// log.Print(err)
|
||||||
|
// return
|
||||||
|
// }
|
||||||
|
|
||||||
|
// ll := strings.Split(string(stdout), "\n")
|
||||||
|
|
||||||
|
// for _, line := range ll {
|
||||||
|
// if strings.HasPrefix(line, "PortRcvData") || strings.HasPrefix(line, "RcvData") {
|
||||||
|
// lv := strings.Fields(line)
|
||||||
|
// v, err := strconv.ParseFloat(lv[1], 64)
|
||||||
|
// if err == nil {
|
||||||
|
// y, err := lp.New("ib_recv", m.tags, map[string]interface{}{"value": float64(v)}, time.Now())
|
||||||
|
// if err == nil {
|
||||||
|
// *out = append(*out, y)
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// if strings.HasPrefix(line, "PortXmitData") || strings.HasPrefix(line, "XmtData") {
|
||||||
|
// lv := strings.Fields(line)
|
||||||
|
// v, err := strconv.ParseFloat(lv[1], 64)
|
||||||
|
// if err == nil {
|
||||||
|
// y, err := lp.New("ib_xmit", m.tags, map[string]interface{}{"value": float64(v)}, time.Now())
|
||||||
|
// if err == nil {
|
||||||
|
// *out = append(*out, y)
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *InfinibandCollector) Close() {
|
func (m *InfinibandCollector) Close() {
|
||||||
|
@ -15,15 +15,44 @@ import (
|
|||||||
"log"
|
"log"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
"os"
|
||||||
"unsafe"
|
"unsafe"
|
||||||
|
"math"
|
||||||
|
"encoding/json"
|
||||||
|
"gopkg.in/Knetic/govaluate.v2"
|
||||||
|
"io/ioutil"
|
||||||
|
"strconv"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
type LikwidCollectorMetricConfig struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
Calc string `json:"calc"`
|
||||||
|
Socket_scope bool `json:"socket_scope"`
|
||||||
|
Publish bool `json:"publish"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type LikwidCollectorEventsetConfig struct {
|
||||||
|
Events map[string]string `json:"events"`
|
||||||
|
Metrics []LikwidCollectorMetricConfig `json:"metrics"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type LikwidCollectorConfig struct {
|
||||||
|
Eventsets []LikwidCollectorEventsetConfig `json:"eventsets"`
|
||||||
|
Metrics []LikwidCollectorMetricConfig `json:"globalmetrics"`
|
||||||
|
ExcludeMetrics []string `json:"exclude_metrics"`
|
||||||
|
}
|
||||||
|
|
||||||
type LikwidCollector struct {
|
type LikwidCollector struct {
|
||||||
MetricCollector
|
MetricCollector
|
||||||
cpulist []C.int
|
cpulist []C.int
|
||||||
sock2tid map[int]int
|
sock2tid map[int]int
|
||||||
metrics map[C.int]map[string]int
|
metrics map[C.int]map[string]int
|
||||||
groups map[string]C.int
|
groups []C.int
|
||||||
|
config LikwidCollectorConfig
|
||||||
|
results map[int]map[int]map[string]interface{}
|
||||||
|
mresults map[int]map[int]map[string]float64
|
||||||
|
gmresults map[int]map[string]float64
|
||||||
|
basefreq float64
|
||||||
}
|
}
|
||||||
|
|
||||||
type LikwidMetric struct {
|
type LikwidMetric struct {
|
||||||
@ -33,7 +62,7 @@ type LikwidMetric struct {
|
|||||||
group_idx int
|
group_idx int
|
||||||
}
|
}
|
||||||
|
|
||||||
const GROUPPATH = `/home/unrz139/Work/cc-metric-collector/collectors/likwid/groups`
|
const GROUPPATH = `/apps/likwid/5.2.0/share/likwid/perfgroups`
|
||||||
|
|
||||||
var likwid_metrics = map[string][]LikwidMetric{
|
var likwid_metrics = map[string][]LikwidMetric{
|
||||||
"MEM_DP": {LikwidMetric{name: "mem_bw", search: "Memory bandwidth [MBytes/s]", socket_scope: true},
|
"MEM_DP": {LikwidMetric{name: "mem_bw", search: "Memory bandwidth [MBytes/s]", socket_scope: true},
|
||||||
@ -57,6 +86,33 @@ func getMetricId(group C.int, search string) (int, error) {
|
|||||||
return -1, errors.New(fmt.Sprintf("Cannot find metric for search string '%s' in group %d", search, int(group)))
|
return -1, errors.New(fmt.Sprintf("Cannot find metric for search string '%s' in group %d", search, int(group)))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func eventsToEventStr(events map[string]string) string {
|
||||||
|
elist := make([]string, 0)
|
||||||
|
for k, v := range events {
|
||||||
|
elist = append(elist, fmt.Sprintf("%s:%s", v, k))
|
||||||
|
}
|
||||||
|
return strings.Join(elist, ",")
|
||||||
|
}
|
||||||
|
|
||||||
|
func getBaseFreq() float64 {
|
||||||
|
var freq float64 = math.NaN()
|
||||||
|
C.power_init(0)
|
||||||
|
info := C.get_powerInfo()
|
||||||
|
if float64(info.baseFrequency) != 0 {
|
||||||
|
freq = float64(info.baseFrequency)
|
||||||
|
} else {
|
||||||
|
buffer, err := ioutil.ReadFile("/sys/devices/system/cpu/cpu0/cpufreq/bios_limit")
|
||||||
|
if err == nil {
|
||||||
|
data := strings.Replace(string(buffer), "\n", "", -1)
|
||||||
|
x, err := strconv.ParseInt(data, 0, 64)
|
||||||
|
if err == nil {
|
||||||
|
freq = float64(x)*1E3
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return freq
|
||||||
|
}
|
||||||
|
|
||||||
func getSocketCpus() map[C.int]int {
|
func getSocketCpus() map[C.int]int {
|
||||||
slist := SocketList()
|
slist := SocketList()
|
||||||
var cpu C.int
|
var cpu C.int
|
||||||
@ -71,9 +127,15 @@ func getSocketCpus() map[C.int]int {
|
|||||||
return outmap
|
return outmap
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *LikwidCollector) Init() error {
|
func (m *LikwidCollector) Init(config []byte) error {
|
||||||
var ret C.int
|
var ret C.int
|
||||||
m.name = "LikwidCollector"
|
m.name = "LikwidCollector"
|
||||||
|
if len(config) > 0 {
|
||||||
|
err := json.Unmarshal(config, &m.config)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
m.setup()
|
m.setup()
|
||||||
cpulist := CpuList()
|
cpulist := CpuList()
|
||||||
m.cpulist = make([]C.int, len(cpulist))
|
m.cpulist = make([]C.int, len(cpulist))
|
||||||
@ -86,161 +148,183 @@ func (m *LikwidCollector) Init() error {
|
|||||||
m.sock2tid[sid] = i
|
m.sock2tid[sid] = i
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
m.metrics = make(map[C.int]map[string]int)
|
m.results = make(map[int]map[int]map[string]interface{})
|
||||||
m.groups = make(map[string]C.int)
|
m.mresults = make(map[int]map[int]map[string]float64)
|
||||||
|
m.gmresults = make(map[int]map[string]float64)
|
||||||
ret = C.topology_init()
|
ret = C.topology_init()
|
||||||
if ret != 0 {
|
if ret != 0 {
|
||||||
return errors.New("Failed to initialize LIKWID topology")
|
return errors.New("Failed to initialize LIKWID topology")
|
||||||
}
|
}
|
||||||
|
os.Setenv("LIKWID_FORCE", "1")
|
||||||
ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
|
ret = C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
|
||||||
if ret != 0 {
|
if ret != 0 {
|
||||||
C.topology_finalize()
|
C.topology_finalize()
|
||||||
return errors.New("Failed to initialize LIKWID topology")
|
return errors.New("Failed to initialize LIKWID topology")
|
||||||
}
|
}
|
||||||
gpath := C.CString(GROUPPATH)
|
|
||||||
C.config_setGroupPath(gpath)
|
|
||||||
C.free(unsafe.Pointer(gpath))
|
|
||||||
|
|
||||||
for g, metrics := range likwid_metrics {
|
for i, evset := range m.config.Eventsets {
|
||||||
cstr := C.CString(g)
|
estr := eventsToEventStr(evset.Events)
|
||||||
gid := C.perfmon_addEventSet(cstr)
|
cstr := C.CString(estr)
|
||||||
if gid >= 0 {
|
gid := C.perfmon_addEventSet(cstr)
|
||||||
gmetrics := 0
|
if gid >= 0 {
|
||||||
for i, metric := range metrics {
|
m.groups = append(m.groups, gid)
|
||||||
idx, err := getMetricId(gid, metric.search)
|
}
|
||||||
if err != nil {
|
C.free(unsafe.Pointer(cstr))
|
||||||
log.Print(err)
|
m.results[i] = make(map[int]map[string]interface{})
|
||||||
} else {
|
m.mresults[i] = make(map[int]map[string]float64)
|
||||||
likwid_metrics[g][i].group_idx = idx
|
for tid, _ := range m.cpulist {
|
||||||
gmetrics++
|
m.results[i][tid] = make(map[string]interface{})
|
||||||
}
|
m.mresults[i][tid] = make(map[string]float64)
|
||||||
}
|
m.gmresults[tid] = make(map[string]float64)
|
||||||
if gmetrics > 0 {
|
}
|
||||||
m.groups[g] = gid
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
log.Print("Failed to add events set ", g)
|
|
||||||
}
|
|
||||||
C.free(unsafe.Pointer(cstr))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(m.groups) == 0 {
|
if len(m.groups) == 0 {
|
||||||
C.perfmon_finalize()
|
C.perfmon_finalize()
|
||||||
C.topology_finalize()
|
C.topology_finalize()
|
||||||
return errors.New("No LIKWID performance group initialized")
|
return errors.New("No LIKWID performance group initialized")
|
||||||
}
|
}
|
||||||
|
m.basefreq = getBaseFreq()
|
||||||
|
log.Print(m.basefreq)
|
||||||
m.init = true
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *LikwidCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
func (m *LikwidCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
||||||
if m.init {
|
if !m.init {
|
||||||
var ret C.int
|
return
|
||||||
core_fp_any := make(map[int]float64, len(m.cpulist))
|
}
|
||||||
for _, cpu := range m.cpulist {
|
var ret C.int
|
||||||
core_fp_any[int(cpu)] = 0.0
|
|
||||||
}
|
|
||||||
for gname, gid := range m.groups {
|
|
||||||
ret = C.perfmon_setupCounters(gid)
|
|
||||||
if ret != 0 {
|
|
||||||
log.Print("Failed to setup performance group ", gname)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
ret = C.perfmon_startCounters()
|
|
||||||
if ret != 0 {
|
|
||||||
log.Print("Failed to start performance group ", gname)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
time.Sleep(interval)
|
|
||||||
ret = C.perfmon_stopCounters()
|
|
||||||
if ret != 0 {
|
|
||||||
log.Print("Failed to stop performance group ", gname)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, lmetric := range likwid_metrics[gname] {
|
for i, gid := range m.groups {
|
||||||
if lmetric.name == "pwr1" || lmetric.name == "pwr2" {
|
evset := m.config.Eventsets[i]
|
||||||
continue
|
ret = C.perfmon_setupCounters(gid)
|
||||||
}
|
if ret != 0 {
|
||||||
mname := lmetric.name
|
log.Print("Failed to setup performance group ", C.perfmon_getGroupName(gid))
|
||||||
inverse := false
|
continue
|
||||||
if mname == "cpi" {
|
}
|
||||||
mname = "ipc"
|
ret = C.perfmon_startCounters()
|
||||||
inverse = true
|
if ret != 0 {
|
||||||
}
|
log.Print("Failed to start performance group ", C.perfmon_getGroupName(gid))
|
||||||
if lmetric.socket_scope {
|
continue
|
||||||
for sid, tid := range m.sock2tid {
|
}
|
||||||
res := C.perfmon_getLastMetric(gid, C.int(lmetric.group_idx), C.int(tid))
|
time.Sleep(interval)
|
||||||
y, err := lp.New(lmetric.name,
|
ret = C.perfmon_stopCounters()
|
||||||
map[string]string{"type": "socket", "type-id": fmt.Sprintf("%d", int(sid))},
|
if ret != 0 {
|
||||||
map[string]interface{}{"value": float64(res)},
|
log.Print("Failed to stop performance group ", C.perfmon_getGroupName(gid))
|
||||||
time.Now())
|
continue
|
||||||
if err == nil {
|
}
|
||||||
*out = append(*out, y)
|
var eidx C.int
|
||||||
}
|
for tid, _ := range m.cpulist {
|
||||||
// log.Print("Metric '", lmetric.name,"' on Socket ",int(sid)," returns ", m.sockets[int(sid)][lmetric.name])
|
for eidx = 0; int(eidx) < len(evset.Events); eidx++ {
|
||||||
}
|
ctr := C.perfmon_getCounterName(gid, eidx)
|
||||||
} else {
|
gctr := C.GoString(ctr)
|
||||||
for tid, cpu := range m.cpulist {
|
res := C.perfmon_getLastResult(gid, eidx, C.int(tid))
|
||||||
res := C.perfmon_getLastMetric(gid, C.int(lmetric.group_idx), C.int(tid))
|
m.results[i][tid][gctr] = float64(res)
|
||||||
value := float64(res)
|
}
|
||||||
if inverse {
|
m.results[i][tid]["time"] = float64(interval)
|
||||||
value = 1.0 / value
|
m.results[i][tid]["inverseClock"] = float64(1.0/m.basefreq)
|
||||||
}
|
for _, metric := range evset.Metrics {
|
||||||
y, err := lp.New(mname,
|
expression, err := govaluate.NewEvaluableExpression(metric.Calc)
|
||||||
map[string]string{"type": "cpu", "type-id": fmt.Sprintf("%d", int(cpu))},
|
if err != nil {
|
||||||
map[string]interface{}{"value": value},
|
log.Print(err.Error())
|
||||||
time.Now())
|
continue
|
||||||
if err == nil {
|
}
|
||||||
*out = append(*out, y)
|
result, err := expression.Evaluate(m.results[i][tid]);
|
||||||
}
|
if err != nil {
|
||||||
if lmetric.name == "flops_dp" {
|
log.Print(err.Error())
|
||||||
core_fp_any[int(cpu)] += 2 * float64(res)
|
continue
|
||||||
}
|
}
|
||||||
if lmetric.name == "flops_sp" {
|
m.mresults[i][tid][metric.Name] = float64(result.(float64))
|
||||||
core_fp_any[int(cpu)] += float64(res)
|
}
|
||||||
}
|
|
||||||
// log.Print("Metric '", lmetric.name,"' on CPU ",int(cpu)," returns ", m.cpus[int(cpu)][lmetric.name])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for sid, tid := range m.sock2tid {
|
|
||||||
sum := 0.0
|
|
||||||
valid := false
|
|
||||||
for _, lmetric := range likwid_metrics[gname] {
|
|
||||||
if lmetric.name == "pwr1" || lmetric.name == "pwr2" {
|
|
||||||
res := C.perfmon_getLastMetric(gid, C.int(lmetric.group_idx), C.int(tid))
|
|
||||||
sum += float64(res)
|
|
||||||
valid = true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if valid {
|
|
||||||
y, err := lp.New("power",
|
|
||||||
map[string]string{"type": "socket", "type-id": fmt.Sprintf("%d", int(sid))},
|
|
||||||
map[string]interface{}{"value": float64(sum)},
|
|
||||||
time.Now())
|
|
||||||
if err == nil {
|
|
||||||
*out = append(*out, y)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for cpu := range m.cpulist {
|
|
||||||
y, err := lp.New("flops_any",
|
|
||||||
map[string]string{"type": "cpu", "type-id": fmt.Sprintf("%d", int(cpu))},
|
|
||||||
map[string]interface{}{"value": float64(core_fp_any[int(cpu)])},
|
|
||||||
time.Now())
|
|
||||||
if err == nil {
|
|
||||||
*out = append(*out, y)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for _, metric := range m.config.Metrics {
|
||||||
|
for tid, _ := range m.cpulist {
|
||||||
|
var params map[string]interface{}
|
||||||
|
expression, err := govaluate.NewEvaluableExpression(metric.Calc)
|
||||||
|
if err != nil {
|
||||||
|
log.Print(err.Error())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
params = make(map[string]interface{})
|
||||||
|
for j, _ := range m.groups {
|
||||||
|
for mname, mres := range m.mresults[j][tid] {
|
||||||
|
params[mname] = mres
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result, err := expression.Evaluate(params);
|
||||||
|
if err != nil {
|
||||||
|
log.Print(err.Error())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
m.gmresults[tid][metric.Name] = float64(result.(float64))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for i, _ := range m.groups {
|
||||||
|
evset := m.config.Eventsets[i]
|
||||||
|
for _, metric := range evset.Metrics {
|
||||||
|
_, skip := stringArrayContains(m.config.ExcludeMetrics, metric.Name)
|
||||||
|
if metric.Publish && !skip {
|
||||||
|
if metric.Socket_scope {
|
||||||
|
for sid, tid := range m.sock2tid {
|
||||||
|
y, err := lp.New(metric.Name,
|
||||||
|
map[string]string{"type": "socket", "type-id": fmt.Sprintf("%d", int(sid))},
|
||||||
|
map[string]interface{}{"value": m.mresults[i][tid][metric.Name]},
|
||||||
|
time.Now())
|
||||||
|
if err == nil {
|
||||||
|
*out = append(*out, y)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for tid, cpu := range m.cpulist {
|
||||||
|
y, err := lp.New(metric.Name,
|
||||||
|
map[string]string{"type": "cpu", "type-id": fmt.Sprintf("%d", int(cpu))},
|
||||||
|
map[string]interface{}{"value": m.mresults[i][tid][metric.Name]},
|
||||||
|
time.Now())
|
||||||
|
if err == nil {
|
||||||
|
*out = append(*out, y)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, metric := range m.config.Metrics {
|
||||||
|
_, skip := stringArrayContains(m.config.ExcludeMetrics, metric.Name)
|
||||||
|
if metric.Publish && !skip {
|
||||||
|
if metric.Socket_scope {
|
||||||
|
for sid, tid := range m.sock2tid {
|
||||||
|
y, err := lp.New(metric.Name,
|
||||||
|
map[string]string{"type": "socket", "type-id": fmt.Sprintf("%d", int(sid))},
|
||||||
|
map[string]interface{}{"value": m.gmresults[tid][metric.Name]},
|
||||||
|
time.Now())
|
||||||
|
if err == nil {
|
||||||
|
*out = append(*out, y)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for tid, cpu := range m.cpulist {
|
||||||
|
y, err := lp.New(metric.Name,
|
||||||
|
map[string]string{"type": "cpu", "type-id": fmt.Sprintf("%d", int(cpu))},
|
||||||
|
map[string]interface{}{"value": m.gmresults[tid][metric.Name]},
|
||||||
|
time.Now())
|
||||||
|
if err == nil {
|
||||||
|
*out = append(*out, y)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
func (m *LikwidCollector) Close() {
|
func (m *LikwidCollector) Close() {
|
||||||
if m.init {
|
if m.init {
|
||||||
|
m.init = false
|
||||||
C.perfmon_finalize()
|
C.perfmon_finalize()
|
||||||
C.topology_finalize()
|
C.topology_finalize()
|
||||||
m.init = false
|
|
||||||
}
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
@ -6,20 +6,32 @@ import (
|
|||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
"encoding/json"
|
||||||
)
|
)
|
||||||
|
|
||||||
const LOADAVGFILE = `/proc/loadavg`
|
const LOADAVGFILE = `/proc/loadavg`
|
||||||
|
|
||||||
|
type LoadavgCollectorConfig struct {
|
||||||
|
ExcludeMetrics []string `json:"exclude_metrics, omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
type LoadavgCollector struct {
|
type LoadavgCollector struct {
|
||||||
MetricCollector
|
MetricCollector
|
||||||
tags map[string]string
|
tags map[string]string
|
||||||
load_matches []string
|
load_matches []string
|
||||||
proc_matches []string
|
proc_matches []string
|
||||||
|
config LoadavgCollectorConfig
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *LoadavgCollector) Init() error {
|
func (m *LoadavgCollector) Init(config []byte) error {
|
||||||
m.name = "LoadavgCollector"
|
m.name = "LoadavgCollector"
|
||||||
m.setup()
|
m.setup()
|
||||||
|
if len(config) > 0 {
|
||||||
|
err := json.Unmarshal(config, &m.config)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
m.tags = map[string]string{"type": "node"}
|
m.tags = map[string]string{"type": "node"}
|
||||||
m.load_matches = []string{"load_one", "load_five", "load_fifteen"}
|
m.load_matches = []string{"load_one", "load_five", "load_fifteen"}
|
||||||
m.proc_matches = []string{"proc_run", "proc_total"}
|
m.proc_matches = []string{"proc_run", "proc_total"}
|
||||||
@ -28,7 +40,10 @@ func (m *LoadavgCollector) Init() error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *LoadavgCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
func (m *LoadavgCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
||||||
|
var skip bool
|
||||||
|
if !m.init {
|
||||||
|
return
|
||||||
|
}
|
||||||
buffer, err := ioutil.ReadFile(string(LOADAVGFILE))
|
buffer, err := ioutil.ReadFile(string(LOADAVGFILE))
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -39,8 +54,9 @@ func (m *LoadavgCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
|
|||||||
for i, name := range m.load_matches {
|
for i, name := range m.load_matches {
|
||||||
x, err := strconv.ParseFloat(ls[i], 64)
|
x, err := strconv.ParseFloat(ls[i], 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, name)
|
||||||
y, err := lp.New(name, m.tags, map[string]interface{}{"value": float64(x)}, time.Now())
|
y, err := lp.New(name, m.tags, map[string]interface{}{"value": float64(x)}, time.Now())
|
||||||
if err == nil {
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -49,8 +65,9 @@ func (m *LoadavgCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
|
|||||||
for i, name := range m.proc_matches {
|
for i, name := range m.proc_matches {
|
||||||
x, err := strconv.ParseFloat(lv[i], 64)
|
x, err := strconv.ParseFloat(lv[i], 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, name)
|
||||||
y, err := lp.New(name, m.tags, map[string]interface{}{"value": float64(x)}, time.Now())
|
y, err := lp.New(name, m.tags, map[string]interface{}{"value": float64(x)}, time.Now())
|
||||||
if err == nil {
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -7,18 +7,34 @@ import (
|
|||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
)
|
)
|
||||||
|
|
||||||
const LUSTREFILE = `/proc/fs/lustre/llite/lnec-XXXXXX/stats`
|
const LUSTREFILE = `/proc/fs/lustre/llite/lnec-XXXXXX/stats`
|
||||||
|
|
||||||
|
type LustreCollectorConfig struct {
|
||||||
|
procfiles []string `json:"procfiles"`
|
||||||
|
ExcludeMetrics []string `json:"exclude_metrics"`
|
||||||
|
}
|
||||||
|
|
||||||
type LustreCollector struct {
|
type LustreCollector struct {
|
||||||
MetricCollector
|
MetricCollector
|
||||||
tags map[string]string
|
tags map[string]string
|
||||||
matches map[string]map[string]int
|
matches map[string]map[string]int
|
||||||
|
devices []string
|
||||||
|
config LustreCollectorConfig
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *LustreCollector) Init() error {
|
func (m *LustreCollector) Init(config []byte) error {
|
||||||
|
var err error
|
||||||
m.name = "LustreCollector"
|
m.name = "LustreCollector"
|
||||||
|
if len(config) > 0 {
|
||||||
|
err = json.Unmarshal(config, &m.config)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
m.setup()
|
m.setup()
|
||||||
m.tags = map[string]string{"type": "node"}
|
m.tags = map[string]string{"type": "node"}
|
||||||
m.matches = map[string]map[string]int{"read_bytes": {"read_bytes": 6, "read_requests": 1},
|
m.matches = map[string]map[string]int{"read_bytes": {"read_bytes": 6, "read_requests": 1},
|
||||||
@ -29,38 +45,58 @@ func (m *LustreCollector) Init() error {
|
|||||||
"getattr": {"getattr": 1},
|
"getattr": {"getattr": 1},
|
||||||
"statfs": {"statfs": 1},
|
"statfs": {"statfs": 1},
|
||||||
"inode_permission": {"inode_permission": 1}}
|
"inode_permission": {"inode_permission": 1}}
|
||||||
_, err := ioutil.ReadFile(string(LUSTREFILE))
|
m.devices = make([]string, 0)
|
||||||
if err == nil {
|
for _, p := range m.config.procfiles {
|
||||||
m.init = true
|
_, err := ioutil.ReadFile(p)
|
||||||
|
if err == nil {
|
||||||
|
m.devices = append(m.devices, p)
|
||||||
|
} else {
|
||||||
|
log.Print(err.Error())
|
||||||
|
continue
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return err
|
|
||||||
|
if len(m.devices) == 0 {
|
||||||
|
return errors.New("No metrics to collect")
|
||||||
|
}
|
||||||
|
m.init = true
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *LustreCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
func (m *LustreCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
||||||
buffer, err := ioutil.ReadFile(string(LUSTREFILE))
|
if !m.init {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, p := range m.devices {
|
||||||
|
buffer, err := ioutil.ReadFile(p)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Print(err)
|
log.Print(err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, line := range strings.Split(string(buffer), "\n") {
|
for _, line := range strings.Split(string(buffer), "\n") {
|
||||||
lf := strings.Fields(line)
|
lf := strings.Fields(line)
|
||||||
if len(lf) > 1 {
|
if len(lf) > 1 {
|
||||||
for match, fields := range m.matches {
|
for match, fields := range m.matches {
|
||||||
if lf[0] == match {
|
if lf[0] == match {
|
||||||
for name, idx := range fields {
|
for name, idx := range fields {
|
||||||
x, err := strconv.ParseInt(lf[idx], 0, 64)
|
_, skip := stringArrayContains(m.config.ExcludeMetrics, name)
|
||||||
if err == nil {
|
if skip {
|
||||||
y, err := lp.New(name, m.tags, map[string]interface{}{"value": x}, time.Now())
|
continue
|
||||||
if err == nil {
|
}
|
||||||
*out = append(*out, y)
|
x, err := strconv.ParseInt(lf[idx], 0, 64)
|
||||||
}
|
if err == nil {
|
||||||
}
|
y, err := lp.New(name, m.tags, map[string]interface{}{"value": x}, time.Now())
|
||||||
}
|
if err == nil {
|
||||||
}
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -9,22 +9,36 @@ import (
|
|||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
"encoding/json"
|
||||||
)
|
)
|
||||||
|
|
||||||
const MEMSTATFILE = `/proc/meminfo`
|
const MEMSTATFILE = `/proc/meminfo`
|
||||||
|
|
||||||
|
type MemstatCollectorConfig struct {
|
||||||
|
ExcludeMetrics []string `json:"exclude_metrics"`
|
||||||
|
}
|
||||||
|
|
||||||
type MemstatCollector struct {
|
type MemstatCollector struct {
|
||||||
MetricCollector
|
MetricCollector
|
||||||
stats map[string]int64
|
stats map[string]int64
|
||||||
tags map[string]string
|
tags map[string]string
|
||||||
matches map[string]string
|
matches map[string]string
|
||||||
|
config MemstatCollectorConfig
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *MemstatCollector) Init() error {
|
func (m *MemstatCollector) Init(config []byte) error {
|
||||||
|
var err error
|
||||||
m.name = "MemstatCollector"
|
m.name = "MemstatCollector"
|
||||||
|
if len(config) > 0 {
|
||||||
|
err = json.Unmarshal(config, &m.config)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
m.stats = make(map[string]int64)
|
m.stats = make(map[string]int64)
|
||||||
|
m.matches = make(map[string]string)
|
||||||
m.tags = map[string]string{"type": "node"}
|
m.tags = map[string]string{"type": "node"}
|
||||||
m.matches = map[string]string{`MemTotal`: "mem_total",
|
matches := map[string]string{`MemTotal`: "mem_total",
|
||||||
"SwapTotal": "swap_total",
|
"SwapTotal": "swap_total",
|
||||||
"SReclaimable": "mem_sreclaimable",
|
"SReclaimable": "mem_sreclaimable",
|
||||||
"Slab": "mem_slab",
|
"Slab": "mem_slab",
|
||||||
@ -33,8 +47,17 @@ func (m *MemstatCollector) Init() error {
|
|||||||
"Cached": "mem_cached",
|
"Cached": "mem_cached",
|
||||||
"MemAvailable": "mem_available",
|
"MemAvailable": "mem_available",
|
||||||
"SwapFree": "swap_free"}
|
"SwapFree": "swap_free"}
|
||||||
|
for k, v := range matches {
|
||||||
|
_, skip := stringArrayContains(m.config.ExcludeMetrics, k)
|
||||||
|
if (!skip) {
|
||||||
|
m.matches[k] = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(m.matches) == 0 {
|
||||||
|
return errors.New("No metrics to collect")
|
||||||
|
}
|
||||||
m.setup()
|
m.setup()
|
||||||
_, err := ioutil.ReadFile(string(MEMSTATFILE))
|
_, err = ioutil.ReadFile(string(MEMSTATFILE))
|
||||||
if err == nil {
|
if err == nil {
|
||||||
m.init = true
|
m.init = true
|
||||||
}
|
}
|
||||||
@ -42,15 +65,17 @@ func (m *MemstatCollector) Init() error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *MemstatCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
func (m *MemstatCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
||||||
buffer, err := ioutil.ReadFile(string(MEMSTATFILE))
|
if !m.init {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
buffer, err := ioutil.ReadFile(string(MEMSTATFILE))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Print(err)
|
log.Print(err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
ll := strings.Split(string(buffer), "\n")
|
ll := strings.Split(string(buffer), "\n")
|
||||||
|
|
||||||
for _, line := range ll {
|
for _, line := range ll {
|
||||||
ls := strings.Split(line, `:`)
|
ls := strings.Split(line, `:`)
|
||||||
if len(ls) > 1 {
|
if len(ls) > 1 {
|
||||||
@ -81,16 +106,18 @@ func (m *MemstatCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
|
|||||||
if _, buffers := m.stats[`Buffers`]; buffers {
|
if _, buffers := m.stats[`Buffers`]; buffers {
|
||||||
if _, cached := m.stats[`Cached`]; cached {
|
if _, cached := m.stats[`Cached`]; cached {
|
||||||
memUsed := m.stats[`MemTotal`] - (m.stats[`MemFree`] + m.stats[`Buffers`] + m.stats[`Cached`])
|
memUsed := m.stats[`MemTotal`] - (m.stats[`MemFree`] + m.stats[`Buffers`] + m.stats[`Cached`])
|
||||||
|
_, skip := stringArrayContains(m.config.ExcludeMetrics, "mem_used")
|
||||||
y, err := lp.New("mem_used", m.tags, map[string]interface{}{"value": int(float64(memUsed) * 1.0e-3)}, time.Now())
|
y, err := lp.New("mem_used", m.tags, map[string]interface{}{"value": int(float64(memUsed) * 1.0e-3)}, time.Now())
|
||||||
if err == nil {
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if _, found := m.stats[`MemShared`]; found {
|
if _, found := m.stats[`MemShared`]; found {
|
||||||
|
_, skip := stringArrayContains(m.config.ExcludeMetrics, "mem_shared")
|
||||||
y, err := lp.New("mem_shared", m.tags, map[string]interface{}{"value": int(float64(m.stats[`MemShared`]) * 1.0e-3)}, time.Now())
|
y, err := lp.New("mem_shared", m.tags, map[string]interface{}{"value": int(float64(m.stats[`MemShared`]) * 1.0e-3)}, time.Now())
|
||||||
if err == nil {
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -7,11 +7,12 @@ import (
|
|||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
"errors"
|
||||||
)
|
)
|
||||||
|
|
||||||
type MetricGetter interface {
|
type MetricGetter interface {
|
||||||
Name() string
|
Name() string
|
||||||
Init() error
|
Init(config []byte) error
|
||||||
Read(time.Duration, *[]lp.MutableMetric)
|
Read(time.Duration, *[]lp.MutableMetric)
|
||||||
Close()
|
Close()
|
||||||
// GetNodeMetric() map[string]interface{}
|
// GetNodeMetric() map[string]interface{}
|
||||||
@ -67,6 +68,15 @@ func intArrayContains(array []int, str int) (int, bool) {
|
|||||||
return -1, false
|
return -1, false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func stringArrayContains(array []string, str string) (int, bool) {
|
||||||
|
for i, a := range array {
|
||||||
|
if a == str {
|
||||||
|
return i, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return -1, false
|
||||||
|
}
|
||||||
|
|
||||||
func SocketList() []int {
|
func SocketList() []int {
|
||||||
buffer, err := ioutil.ReadFile("/proc/cpuinfo")
|
buffer, err := ioutil.ReadFile("/proc/cpuinfo")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -132,3 +142,13 @@ func Fields2Map(metric lp.Metric) map[string]interface{} {
|
|||||||
}
|
}
|
||||||
return fields
|
return fields
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func RemoveFromStringList(s []string, r string) ([]string, error) {
|
||||||
|
for i, item := range s {
|
||||||
|
if r == item {
|
||||||
|
return append(s[:i], s[i+1:]...), nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return s, errors.New("No such string in list")
|
||||||
|
}
|
||||||
|
|
||||||
|
@ -7,27 +7,36 @@ import (
|
|||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
"encoding/json"
|
||||||
)
|
)
|
||||||
|
|
||||||
const NETSTATFILE = `/proc/net/dev`
|
const NETSTATFILE = `/proc/net/dev`
|
||||||
|
|
||||||
type NetstatCollector struct {
|
type NetstatCollectorConfig struct {
|
||||||
MetricCollector
|
ExcludeDevices []string `json:"exclude_devices, omitempty"`
|
||||||
matches map[int]string
|
|
||||||
tags map[string]string
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *NetstatCollector) Init() error {
|
type NetstatCollector struct {
|
||||||
|
MetricCollector
|
||||||
|
config NetstatCollectorConfig
|
||||||
|
matches map[int]string
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *NetstatCollector) Init(config []byte) error {
|
||||||
m.name = "NetstatCollector"
|
m.name = "NetstatCollector"
|
||||||
m.setup()
|
m.setup()
|
||||||
m.tags = map[string]string{"type": "node"}
|
|
||||||
m.matches = map[int]string{
|
m.matches = map[int]string{
|
||||||
1: "bytes_in",
|
1: "bytes_in",
|
||||||
9: "bytes_out",
|
9: "bytes_out",
|
||||||
2: "pkts_in",
|
2: "pkts_in",
|
||||||
10: "pkts_out",
|
10: "pkts_out",
|
||||||
}
|
}
|
||||||
_, err := ioutil.ReadFile(string(NETSTATFILE))
|
err := json.Unmarshal(config, &m.config)
|
||||||
|
if err != nil {
|
||||||
|
log.Print(err.Error())
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
_, err = ioutil.ReadFile(string(NETSTATFILE))
|
||||||
if err == nil {
|
if err == nil {
|
||||||
m.init = true
|
m.init = true
|
||||||
}
|
}
|
||||||
@ -48,13 +57,20 @@ func (m *NetstatCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
|
|||||||
}
|
}
|
||||||
f := strings.Fields(l)
|
f := strings.Fields(l)
|
||||||
dev := f[0][0 : len(f[0])-1]
|
dev := f[0][0 : len(f[0])-1]
|
||||||
if dev == "lo" {
|
cont := false
|
||||||
|
for _, d := range m.config.ExcludeDevices {
|
||||||
|
if d == dev {
|
||||||
|
cont = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if cont {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
tags := map[string]string{"device" : dev, "type": "node"}
|
||||||
for i, name := range m.matches {
|
for i, name := range m.matches {
|
||||||
v, err := strconv.ParseInt(f[i], 10, 0)
|
v, err := strconv.ParseInt(f[i], 10, 0)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
y, err := lp.New(name, m.tags, map[string]interface{}{"value": int(float64(v) * 1.0e-3)}, time.Now())
|
y, err := lp.New(name, tags, map[string]interface{}{"value": int(float64(v) * 1.0e-3)}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
|
@ -7,11 +7,18 @@ import (
|
|||||||
lp "github.com/influxdata/line-protocol"
|
lp "github.com/influxdata/line-protocol"
|
||||||
"log"
|
"log"
|
||||||
"time"
|
"time"
|
||||||
|
"encoding/json"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
type NvidiaCollectorConfig struct {
|
||||||
|
ExcludeMetrics []string `json:"exclude_metrics, omitempty"`
|
||||||
|
ExcludeDevices []string `json:"exclude_devices, omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
type NvidiaCollector struct {
|
type NvidiaCollector struct {
|
||||||
MetricCollector
|
MetricCollector
|
||||||
num_gpus int
|
num_gpus int
|
||||||
|
config NvidiaCollectorConfig
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *NvidiaCollector) CatchPanic() error {
|
func (m *NvidiaCollector) CatchPanic() error {
|
||||||
@ -24,20 +31,26 @@ func (m *NvidiaCollector) CatchPanic() error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *NvidiaCollector) Init() error {
|
func (m *NvidiaCollector) Init(config []byte) error {
|
||||||
|
var err error
|
||||||
m.name = "NvidiaCollector"
|
m.name = "NvidiaCollector"
|
||||||
m.setup()
|
m.setup()
|
||||||
|
if len(config) > 0 {
|
||||||
|
err = json.Unmarshal(config, &m.config)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
m.num_gpus = 0
|
m.num_gpus = 0
|
||||||
defer m.CatchPanic()
|
defer m.CatchPanic()
|
||||||
ret := nvml.Init()
|
ret := nvml.Init()
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
err := errors.New(nvml.ErrorString(ret))
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
log.Print(err)
|
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
m.num_gpus, ret = nvml.DeviceGetCount()
|
m.num_gpus, ret = nvml.DeviceGetCount()
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
err := errors.New(nvml.ErrorString(ret))
|
err = errors.New(nvml.ErrorString(ret))
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
m.init = true
|
m.init = true
|
||||||
@ -45,23 +58,31 @@ func (m *NvidiaCollector) Init() error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
||||||
|
if (!m.init) {
|
||||||
|
return
|
||||||
|
}
|
||||||
for i := 0; i < m.num_gpus; i++ {
|
for i := 0; i < m.num_gpus; i++ {
|
||||||
device, ret := nvml.DeviceGetHandleByIndex(i)
|
device, ret := nvml.DeviceGetHandleByIndex(i)
|
||||||
if ret != nvml.SUCCESS {
|
if ret != nvml.SUCCESS {
|
||||||
log.Fatalf("Unable to get device at index %d: %v", i, nvml.ErrorString(ret))
|
log.Fatalf("Unable to get device at index %d: %v", i, nvml.ErrorString(ret))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
_, skip := stringArrayContains(m.config.ExcludeDevices, fmt.Sprintf("%d", i))
|
||||||
|
if skip {
|
||||||
|
continue
|
||||||
|
}
|
||||||
tags := map[string]string{"type": "accelerator", "type-id": fmt.Sprintf("%d", i)}
|
tags := map[string]string{"type": "accelerator", "type-id": fmt.Sprintf("%d", i)}
|
||||||
|
|
||||||
util, ret := nvml.DeviceGetUtilizationRates(device)
|
util, ret := nvml.DeviceGetUtilizationRates(device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "util")
|
||||||
y, err := lp.New("util", tags, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
|
y, err := lp.New("util", tags, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
|
||||||
if err == nil {
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "mem_util")
|
||||||
y, err = lp.New("mem_util", tags, map[string]interface{}{"value": float64(util.Memory)}, time.Now())
|
y, err = lp.New("mem_util", tags, map[string]interface{}{"value": float64(util.Memory)}, time.Now())
|
||||||
if err == nil {
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -69,29 +90,33 @@ func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
|
|||||||
meminfo, ret := nvml.DeviceGetMemoryInfo(device)
|
meminfo, ret := nvml.DeviceGetMemoryInfo(device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
t := float64(meminfo.Total) / (1024 * 1024)
|
t := float64(meminfo.Total) / (1024 * 1024)
|
||||||
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "mem_total")
|
||||||
y, err := lp.New("mem_total", tags, map[string]interface{}{"value": t}, time.Now())
|
y, err := lp.New("mem_total", tags, map[string]interface{}{"value": t}, time.Now())
|
||||||
if err == nil {
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
f := float64(meminfo.Used) / (1024 * 1024)
|
f := float64(meminfo.Used) / (1024 * 1024)
|
||||||
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "fb_memory")
|
||||||
y, err = lp.New("fb_memory", tags, map[string]interface{}{"value": f}, time.Now())
|
y, err = lp.New("fb_memory", tags, map[string]interface{}{"value": f}, time.Now())
|
||||||
if err == nil {
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
temp, ret := nvml.DeviceGetTemperature(device, nvml.TEMPERATURE_GPU)
|
temp, ret := nvml.DeviceGetTemperature(device, nvml.TEMPERATURE_GPU)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "temp")
|
||||||
y, err := lp.New("temp", tags, map[string]interface{}{"value": float64(temp)}, time.Now())
|
y, err := lp.New("temp", tags, map[string]interface{}{"value": float64(temp)}, time.Now())
|
||||||
if err == nil {
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fan, ret := nvml.DeviceGetFanSpeed(device)
|
fan, ret := nvml.DeviceGetFanSpeed(device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "fan")
|
||||||
y, err := lp.New("fan", tags, map[string]interface{}{"value": float64(fan)}, time.Now())
|
y, err := lp.New("fan", tags, map[string]interface{}{"value": float64(fan)}, time.Now())
|
||||||
if err == nil {
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -108,116 +133,131 @@ func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric)
|
|||||||
default:
|
default:
|
||||||
y, err = lp.New("ecc_mode", tags, map[string]interface{}{"value": string("UNKNOWN")}, time.Now())
|
y, err = lp.New("ecc_mode", tags, map[string]interface{}{"value": string("UNKNOWN")}, time.Now())
|
||||||
}
|
}
|
||||||
if err == nil {
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_mode")
|
||||||
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
} else if ret == nvml.ERROR_NOT_SUPPORTED {
|
} else if ret == nvml.ERROR_NOT_SUPPORTED {
|
||||||
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_mode")
|
||||||
y, err := lp.New("ecc_mode", tags, map[string]interface{}{"value": string("N/A")}, time.Now())
|
y, err := lp.New("ecc_mode", tags, map[string]interface{}{"value": string("N/A")}, time.Now())
|
||||||
if err == nil {
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pstate, ret := nvml.DeviceGetPerformanceState(device)
|
pstate, ret := nvml.DeviceGetPerformanceState(device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "perf_state")
|
||||||
y, err := lp.New("perf_state", tags, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now())
|
y, err := lp.New("perf_state", tags, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now())
|
||||||
if err == nil {
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
power, ret := nvml.DeviceGetPowerUsage(device)
|
power, ret := nvml.DeviceGetPowerUsage(device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "power_usage_report")
|
||||||
y, err := lp.New("power_usage_report", tags, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
|
y, err := lp.New("power_usage_report", tags, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
|
||||||
if err == nil {
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
gclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_GRAPHICS)
|
gclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_GRAPHICS)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "graphics_clock_report")
|
||||||
y, err := lp.New("graphics_clock_report", tags, map[string]interface{}{"value": float64(gclk)}, time.Now())
|
y, err := lp.New("graphics_clock_report", tags, map[string]interface{}{"value": float64(gclk)}, time.Now())
|
||||||
if err == nil {
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM)
|
smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "sm_clock_report")
|
||||||
y, err := lp.New("sm_clock_report", tags, map[string]interface{}{"value": float64(smclk)}, time.Now())
|
y, err := lp.New("sm_clock_report", tags, map[string]interface{}{"value": float64(smclk)}, time.Now())
|
||||||
if err == nil {
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM)
|
memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "mem_clock_report")
|
||||||
y, err := lp.New("mem_clock_report", tags, map[string]interface{}{"value": float64(memclk)}, time.Now())
|
y, err := lp.New("mem_clock_report", tags, map[string]interface{}{"value": float64(memclk)}, time.Now())
|
||||||
if err == nil {
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
max_gclk, ret := nvml.DeviceGetMaxClockInfo(device, nvml.CLOCK_GRAPHICS)
|
max_gclk, ret := nvml.DeviceGetMaxClockInfo(device, nvml.CLOCK_GRAPHICS)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "max_graphics_clock")
|
||||||
y, err := lp.New("max_graphics_clock", tags, map[string]interface{}{"value": float64(max_gclk)}, time.Now())
|
y, err := lp.New("max_graphics_clock", tags, map[string]interface{}{"value": float64(max_gclk)}, time.Now())
|
||||||
if err == nil {
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
max_smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM)
|
max_smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "max_sm_clock")
|
||||||
y, err := lp.New("max_sm_clock", tags, map[string]interface{}{"value": float64(max_smclk)}, time.Now())
|
y, err := lp.New("max_sm_clock", tags, map[string]interface{}{"value": float64(max_smclk)}, time.Now())
|
||||||
if err == nil {
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
max_memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM)
|
max_memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "max_mem_clock")
|
||||||
y, err := lp.New("max_mem_clock", tags, map[string]interface{}{"value": float64(max_memclk)}, time.Now())
|
y, err := lp.New("max_mem_clock", tags, map[string]interface{}{"value": float64(max_memclk)}, time.Now())
|
||||||
if err == nil {
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ecc_db, ret := nvml.DeviceGetTotalEccErrors(device, 1, 1)
|
ecc_db, ret := nvml.DeviceGetTotalEccErrors(device, 1, 1)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_db_error")
|
||||||
y, err := lp.New("ecc_db_error", tags, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
|
y, err := lp.New("ecc_db_error", tags, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
|
||||||
if err == nil {
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device, 0, 1)
|
ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device, 0, 1)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "ecc_sb_error")
|
||||||
y, err := lp.New("ecc_sb_error", tags, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
|
y, err := lp.New("ecc_sb_error", tags, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
|
||||||
if err == nil {
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device)
|
pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "power_man_limit")
|
||||||
y, err := lp.New("power_man_limit", tags, map[string]interface{}{"value": float64(pwr_limit)}, time.Now())
|
y, err := lp.New("power_man_limit", tags, map[string]interface{}{"value": float64(pwr_limit)}, time.Now())
|
||||||
if err == nil {
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device)
|
enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "encoder_util")
|
||||||
y, err := lp.New("encoder_util", tags, map[string]interface{}{"value": float64(enc_util)}, time.Now())
|
y, err := lp.New("encoder_util", tags, map[string]interface{}{"value": float64(enc_util)}, time.Now())
|
||||||
if err == nil {
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device)
|
dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device)
|
||||||
if ret == nvml.SUCCESS {
|
if ret == nvml.SUCCESS {
|
||||||
|
_, skip = stringArrayContains(m.config.ExcludeMetrics, "decoder_util")
|
||||||
y, err := lp.New("decoder_util", tags, map[string]interface{}{"value": float64(dec_util)}, time.Now())
|
y, err := lp.New("decoder_util", tags, map[string]interface{}{"value": float64(dec_util)}, time.Now())
|
||||||
if err == nil {
|
if err == nil && !skip {
|
||||||
*out = append(*out, y)
|
*out = append(*out, y)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -7,29 +7,50 @@ import (
|
|||||||
"os/exec"
|
"os/exec"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
)
|
)
|
||||||
|
|
||||||
const NUM_PROCS = 5
|
const MAX_NUM_PROCS = 10
|
||||||
|
|
||||||
|
type TopProcsCollectorConfig struct {
|
||||||
|
num_procs int `json:"num_procs"`
|
||||||
|
}
|
||||||
|
|
||||||
type TopProcsCollector struct {
|
type TopProcsCollector struct {
|
||||||
MetricCollector
|
MetricCollector
|
||||||
tags map[string]string
|
tags map[string]string
|
||||||
|
config TopProcsCollectorConfig
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *TopProcsCollector) Init() error {
|
func (m *TopProcsCollector) Init(config []byte) error {
|
||||||
|
var err error
|
||||||
m.name = "TopProcsCollector"
|
m.name = "TopProcsCollector"
|
||||||
m.tags = map[string]string{"type": "node"}
|
m.tags = map[string]string{"type": "node"}
|
||||||
|
if len(config) > 0 {
|
||||||
|
err = json.Unmarshal(config, &m.config)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if m.config.num_procs <= 0 || m.config.num_procs > MAX_NUM_PROCS {
|
||||||
|
return errors.New(fmt.Sprintf("num_procs option must be set in 'topprocs' config (range: 1-%d)", MAX_NUM_PROCS))
|
||||||
|
}
|
||||||
m.setup()
|
m.setup()
|
||||||
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
|
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
|
||||||
command.Wait()
|
command.Wait()
|
||||||
_, err := command.Output()
|
_, err = command.Output()
|
||||||
if err == nil {
|
if err != nil {
|
||||||
m.init = true
|
return errors.New("Failed to execute command")
|
||||||
}
|
}
|
||||||
|
m.init = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *TopProcsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
func (m *TopProcsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
||||||
|
if !m.init {
|
||||||
|
return
|
||||||
|
}
|
||||||
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
|
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
|
||||||
command.Wait()
|
command.Wait()
|
||||||
stdout, err := command.Output()
|
stdout, err := command.Output()
|
||||||
@ -39,7 +60,7 @@ func (m *TopProcsCollector) Read(interval time.Duration, out *[]lp.MutableMetric
|
|||||||
}
|
}
|
||||||
|
|
||||||
lines := strings.Split(string(stdout), "\n")
|
lines := strings.Split(string(stdout), "\n")
|
||||||
for i := 1; i < NUM_PROCS+1; i++ {
|
for i := 1; i < m.config.num_procs+1; i++ {
|
||||||
name := fmt.Sprintf("topproc%d", i)
|
name := fmt.Sprintf("topproc%d", i)
|
||||||
y, err := lp.New(name, m.tags, map[string]interface{}{"value": string(lines[i])}, time.Now())
|
y, err := lp.New(name, m.tags, map[string]interface{}{"value": string(lines[i])}, time.Now())
|
||||||
if err == nil {
|
if err == nil {
|
||||||
|
162
config.json
162
config.json
@ -1,30 +1,142 @@
|
|||||||
{
|
{
|
||||||
"sink": {
|
"sink": {
|
||||||
"user": "testuser",
|
"user": "testuser",
|
||||||
"password": "testpass",
|
"password": "testpass",
|
||||||
"host": "127.0.0.1",
|
"host": "127.0.0.1",
|
||||||
"port": "9090",
|
"port": "9090",
|
||||||
"database": "testdb",
|
"database": "testdb",
|
||||||
"organization": "testorg",
|
"organization": "testorg",
|
||||||
"type": "stdout"
|
"type": "stdout"
|
||||||
|
},
|
||||||
|
"interval": 3,
|
||||||
|
"duration": 1,
|
||||||
|
"collectors": [
|
||||||
|
"loadavg",
|
||||||
|
"likwid"
|
||||||
|
],
|
||||||
|
"default_tags": {
|
||||||
|
"cluster": "testcluster"
|
||||||
|
},
|
||||||
|
"receiver": {
|
||||||
|
"type": "none"
|
||||||
|
},
|
||||||
|
"collect_config": {
|
||||||
|
"netstat": {
|
||||||
|
"exclude_devices": [
|
||||||
|
"enp195s0f1",
|
||||||
|
"lo"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"interval" : 3,
|
"ibstat": {
|
||||||
"duration" : 1,
|
"exclude_devices": [
|
||||||
"collectors": [
|
"mlx5_0",
|
||||||
"cpustat",
|
"mlx5_1"
|
||||||
"loadavg",
|
]
|
||||||
"memstat",
|
|
||||||
"netstat",
|
|
||||||
"topprocs",
|
|
||||||
"lustrestat",
|
|
||||||
"ibstat",
|
|
||||||
"nvidia",
|
|
||||||
"likwid"
|
|
||||||
],
|
|
||||||
"default_tags": {
|
|
||||||
"cluster": "testcluster"
|
|
||||||
},
|
},
|
||||||
"receiver": {
|
"cpustat": {
|
||||||
"type": "none"
|
"exclude_metrics": [
|
||||||
|
"cpu_softirq"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"diskstat": {
|
||||||
|
"exclude_metrics": [
|
||||||
|
"writes_merged"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"nvidia": {
|
||||||
|
"exclude_metrics": [
|
||||||
|
"util"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"loadavg": {
|
||||||
|
"exclude_metrics": [
|
||||||
|
"load_fifteen"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"likwid": {
|
||||||
|
"exclude_metrics": [
|
||||||
|
"load_fifteen"
|
||||||
|
],
|
||||||
|
"eventsets": [
|
||||||
|
{
|
||||||
|
"events": {
|
||||||
|
"FIXC1": "ACTUAL_CPU_CLOCK",
|
||||||
|
"FIXC2": "MAX_CPU_CLOCK",
|
||||||
|
"PMC0": "RETIRED_INSTRUCTIONS",
|
||||||
|
"PMC1": "CPU_CLOCKS_UNHALTED",
|
||||||
|
"PMC2": "RETIRED_SSE_AVX_FLOPS_ALL",
|
||||||
|
"PMC3": "MERGE",
|
||||||
|
"DFC0": "DRAM_CHANNEL_0",
|
||||||
|
"DFC1": "DRAM_CHANNEL_1",
|
||||||
|
"DFC2": "DRAM_CHANNEL_2",
|
||||||
|
"DFC3": "DRAM_CHANNEL_3"
|
||||||
|
},
|
||||||
|
"metrics": [
|
||||||
|
{
|
||||||
|
"name": "ipc",
|
||||||
|
"calc": "PMC0/PMC1",
|
||||||
|
"socket_scope": false,
|
||||||
|
"publish": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "flops_any",
|
||||||
|
"calc": "0.000001*PMC2/time",
|
||||||
|
"socket_scope": false,
|
||||||
|
"publish": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "clock_mhz",
|
||||||
|
"calc": "0.000001*(FIXC1/FIXC2)/inverseClock",
|
||||||
|
"socket_scope": false,
|
||||||
|
"publish": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "mem1",
|
||||||
|
"calc": "0.000001*(DFC0+DFC1+DFC2+DFC3)*64.0/time",
|
||||||
|
"socket_scope": true,
|
||||||
|
"publish": false
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"events": {
|
||||||
|
"DFC0": "DRAM_CHANNEL_4",
|
||||||
|
"DFC1": "DRAM_CHANNEL_5",
|
||||||
|
"DFC2": "DRAM_CHANNEL_6",
|
||||||
|
"DFC3": "DRAM_CHANNEL_7",
|
||||||
|
"PWR0": "RAPL_CORE_ENERGY",
|
||||||
|
"PWR1": "RAPL_PKG_ENERGY"
|
||||||
|
},
|
||||||
|
"metrics": [
|
||||||
|
{
|
||||||
|
"name": "pwr_core",
|
||||||
|
"calc": "PWR0/time",
|
||||||
|
"socket_scope": false,
|
||||||
|
"publish": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "pwr_pkg",
|
||||||
|
"calc": "PWR1/time",
|
||||||
|
"socket_scope": true,
|
||||||
|
"publish": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "mem2",
|
||||||
|
"calc": "0.000001*(DFC0+DFC1+DFC2+DFC3)*64.0/time",
|
||||||
|
"socket_scope": true,
|
||||||
|
"publish": false
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"globalmetrics": [
|
||||||
|
{
|
||||||
|
"name": "mem_bw",
|
||||||
|
"calc": "mem1+mem2",
|
||||||
|
"socket_scope": true,
|
||||||
|
"publish": true
|
||||||
|
}
|
||||||
|
]
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -51,6 +51,7 @@ type GlobalConfig struct {
|
|||||||
Collectors []string `json:"collectors"`
|
Collectors []string `json:"collectors"`
|
||||||
Receiver receivers.ReceiverConfig `json:"receiver"`
|
Receiver receivers.ReceiverConfig `json:"receiver"`
|
||||||
DefTags map[string]string `json:"default_tags"`
|
DefTags map[string]string `json:"default_tags"`
|
||||||
|
CollectConfigs map[string]json.RawMessage `json:"collect_config"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// Load JSON configuration file
|
// Load JSON configuration file
|
||||||
@ -62,7 +63,7 @@ func LoadConfiguration(file string, config *GlobalConfig) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
jsonParser := json.NewDecoder(configFile)
|
jsonParser := json.NewDecoder(configFile)
|
||||||
jsonParser.Decode(config)
|
err = jsonParser.Decode(config)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -71,11 +72,17 @@ func ReadCli() map[string]string {
|
|||||||
cfg := flag.String("config", "./config.json", "Path to configuration file")
|
cfg := flag.String("config", "./config.json", "Path to configuration file")
|
||||||
logfile := flag.String("log", "stderr", "Path for logfile")
|
logfile := flag.String("log", "stderr", "Path for logfile")
|
||||||
pidfile := flag.String("pidfile", "/var/run/cc-metric-collector.pid", "Path for PID file")
|
pidfile := flag.String("pidfile", "/var/run/cc-metric-collector.pid", "Path for PID file")
|
||||||
|
once := flag.Bool("once", false, "Run all collectors only once")
|
||||||
flag.Parse()
|
flag.Parse()
|
||||||
m = make(map[string]string)
|
m = make(map[string]string)
|
||||||
m["configfile"] = *cfg
|
m["configfile"] = *cfg
|
||||||
m["logfile"] = *logfile
|
m["logfile"] = *logfile
|
||||||
m["pidfile"] = *pidfile
|
m["pidfile"] = *pidfile
|
||||||
|
if *once {
|
||||||
|
m["once"] = "true"
|
||||||
|
} else {
|
||||||
|
m["once"] = "false"
|
||||||
|
}
|
||||||
return m
|
return m
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -114,27 +121,33 @@ func RemovePidfile(pidfile string) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// General shutdown function that gets executed in case of interrupt or graceful shutdown
|
||||||
|
func shutdown(wg *sync.WaitGroup, collectors []string, sink sinks.SinkFuncs, recv receivers.ReceiverFuncs, pidfile string) {
|
||||||
|
log.Print("Shutdown...")
|
||||||
|
for _, c := range collectors {
|
||||||
|
col := Collectors[c]
|
||||||
|
log.Print("Stop ", col.Name())
|
||||||
|
col.Close()
|
||||||
|
}
|
||||||
|
time.Sleep(1 * time.Second)
|
||||||
|
if recv != nil {
|
||||||
|
recv.Close()
|
||||||
|
}
|
||||||
|
sink.Close()
|
||||||
|
RemovePidfile(pidfile)
|
||||||
|
wg.Done()
|
||||||
|
}
|
||||||
|
|
||||||
// Register an interrupt handler for Ctrl+C and similar. At signal,
|
// Register an interrupt handler for Ctrl+C and similar. At signal,
|
||||||
// all collectors are closed
|
// all collectors are closed
|
||||||
func shutdown(wg *sync.WaitGroup, config *GlobalConfig, sink sinks.SinkFuncs, recv receivers.ReceiverFuncs, pidfile string) {
|
func prepare_shutdown(wg *sync.WaitGroup, config *GlobalConfig, sink sinks.SinkFuncs, recv receivers.ReceiverFuncs, pidfile string) {
|
||||||
sigs := make(chan os.Signal, 1)
|
sigs := make(chan os.Signal, 1)
|
||||||
signal.Notify(sigs, os.Interrupt)
|
signal.Notify(sigs, os.Interrupt)
|
||||||
|
|
||||||
go func(wg *sync.WaitGroup) {
|
go func(wg *sync.WaitGroup) {
|
||||||
<-sigs
|
<-sigs
|
||||||
log.Print("Shutdown...")
|
log.Print("Shutdown...")
|
||||||
for _, c := range config.Collectors {
|
shutdown(wg, config.Collectors, sink, recv, pidfile)
|
||||||
col := Collectors[c]
|
|
||||||
log.Print("Stop ", col.Name())
|
|
||||||
col.Close()
|
|
||||||
}
|
|
||||||
time.Sleep(1 * time.Second)
|
|
||||||
if recv != nil {
|
|
||||||
recv.Close()
|
|
||||||
}
|
|
||||||
sink.Close()
|
|
||||||
RemovePidfile(pidfile)
|
|
||||||
wg.Done()
|
|
||||||
}(wg)
|
}(wg)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -162,6 +175,7 @@ func main() {
|
|||||||
err = LoadConfiguration(clicfg["configfile"], &config)
|
err = LoadConfiguration(clicfg["configfile"], &config)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Print("Error reading configuration file ", clicfg["configfile"])
|
log.Print("Error reading configuration file ", clicfg["configfile"])
|
||||||
|
log.Print(err.Error())
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if config.Interval <= 0 || time.Duration(config.Interval)*time.Second <= 0 {
|
if config.Interval <= 0 || time.Duration(config.Interval)*time.Second <= 0 {
|
||||||
@ -214,15 +228,19 @@ func main() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Register interrupt handler
|
// Register interrupt handler
|
||||||
shutdown(&wg, &config, sink, recv, clicfg["pidfile"])
|
prepare_shutdown(&wg, &config, sink, recv, clicfg["pidfile"])
|
||||||
|
|
||||||
// Initialize all collectors
|
// Initialize all collectors
|
||||||
tmp := make([]string, 0)
|
tmp := make([]string, 0)
|
||||||
for _, c := range config.Collectors {
|
for _, c := range config.Collectors {
|
||||||
col := Collectors[c]
|
col := Collectors[c]
|
||||||
err = col.Init()
|
conf, found := config.CollectConfigs[c]
|
||||||
|
if !found {
|
||||||
|
conf = json.RawMessage("")
|
||||||
|
}
|
||||||
|
err = col.Init([]byte(conf))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Print("SKIP ", col.Name())
|
log.Print("SKIP ", col.Name(), " (", err.Error(),")")
|
||||||
} else {
|
} else {
|
||||||
log.Print("Start ", col.Name())
|
log.Print("Start ", col.Name())
|
||||||
tmp = append(tmp, c)
|
tmp = append(tmp, c)
|
||||||
@ -232,7 +250,11 @@ func main() {
|
|||||||
config.DefTags["hostname"] = host
|
config.DefTags["hostname"] = host
|
||||||
|
|
||||||
// Setup up ticker loop
|
// Setup up ticker loop
|
||||||
log.Print("Running loop every ", time.Duration(config.Interval)*time.Second)
|
if clicfg["once"] != "true" {
|
||||||
|
log.Print("Running loop every ", time.Duration(config.Interval)*time.Second)
|
||||||
|
} else {
|
||||||
|
log.Print("Running loop only once")
|
||||||
|
}
|
||||||
ticker := time.NewTicker(time.Duration(config.Interval) * time.Second)
|
ticker := time.NewTicker(time.Duration(config.Interval) * time.Second)
|
||||||
done := make(chan bool)
|
done := make(chan bool)
|
||||||
|
|
||||||
@ -274,6 +296,10 @@ func main() {
|
|||||||
if err := sink.Flush(); err != nil {
|
if err := sink.Flush(); err != nil {
|
||||||
log.Printf("sink error: %s\n", err)
|
log.Printf("sink error: %s\n", err)
|
||||||
}
|
}
|
||||||
|
if clicfg["once"] == "true" {
|
||||||
|
shutdown(&wg, config.Collectors, sink, recv, clicfg["pidfile"])
|
||||||
|
return
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
Loading…
Reference in New Issue
Block a user