mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-07-19 11:21:41 +02:00
Change storage format
This commit is contained in:
@@ -1,6 +1,8 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
lp "github.com/influxdata/line-protocol"
|
||||
"io/ioutil"
|
||||
"strconv"
|
||||
"strings"
|
||||
@@ -16,34 +18,34 @@ type CpustatCollector struct {
|
||||
func (m *CpustatCollector) Init() error {
|
||||
m.name = "CpustatCollector"
|
||||
m.setup()
|
||||
m.init = true
|
||||
return nil
|
||||
}
|
||||
|
||||
func ParseStatLine(line string, out map[string]interface{}) {
|
||||
func ParseStatLine(line string, cpu int, out *[]lp.MutableMetric) {
|
||||
ls := strings.Fields(line)
|
||||
user, _ := strconv.ParseInt(ls[1], 0, 64)
|
||||
out["cpu_user"] = float64(user)
|
||||
nice, _ := strconv.ParseInt(ls[2], 0, 64)
|
||||
out["cpu_nice"] = float64(nice)
|
||||
system, _ := strconv.ParseInt(ls[3], 0, 64)
|
||||
out["cpu_system"] = float64(system)
|
||||
idle, _ := strconv.ParseInt(ls[4], 0, 64)
|
||||
out["cpu_idle"] = float64(idle)
|
||||
iowait, _ := strconv.ParseInt(ls[5], 0, 64)
|
||||
out["cpu_iowait"] = float64(iowait)
|
||||
irq, _ := strconv.ParseInt(ls[6], 0, 64)
|
||||
out["cpu_irq"] = float64(irq)
|
||||
softirq, _ := strconv.ParseInt(ls[7], 0, 64)
|
||||
out["cpu_softirq"] = float64(softirq)
|
||||
steal, _ := strconv.ParseInt(ls[8], 0, 64)
|
||||
out["cpu_steal"] = float64(steal)
|
||||
guest, _ := strconv.ParseInt(ls[9], 0, 64)
|
||||
out["cpu_guest"] = float64(guest)
|
||||
guest_nice, _ := strconv.ParseInt(ls[10], 0, 64)
|
||||
out["cpu_guest_nice"] = float64(guest_nice)
|
||||
matches := []string{"", "cpu_user", "cpu_nice", "cpu_system", "cpu_idle", "cpu_iowait", "cpu_irq", "cpu_softirq", "cpu_steal", "cpu_guest", "cpu_guest_nice"}
|
||||
|
||||
var tags map[string]string
|
||||
if cpu < 0 {
|
||||
tags = map[string]string{"type": "node"}
|
||||
} else {
|
||||
tags = map[string]string{"type": "cpu", "type-id": fmt.Sprintf("%d", cpu)}
|
||||
}
|
||||
for i, m := range matches {
|
||||
if len(m) > 0 {
|
||||
x, err := strconv.ParseInt(ls[i], 0, 64)
|
||||
if err == nil {
|
||||
y, err := lp.New(m, tags, map[string]interface{}{"value": int(x)}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m *CpustatCollector) Read(interval time.Duration) {
|
||||
func (m *CpustatCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
||||
buffer, err := ioutil.ReadFile(string(CPUSTATFILE))
|
||||
|
||||
if err != nil {
|
||||
@@ -57,15 +59,16 @@ func (m *CpustatCollector) Read(interval time.Duration) {
|
||||
}
|
||||
ls := strings.Fields(line)
|
||||
if strings.Compare(ls[0], "cpu") == 0 {
|
||||
ParseStatLine(line, m.node)
|
||||
ParseStatLine(line, -1, out)
|
||||
} else if strings.HasPrefix(ls[0], "cpu") {
|
||||
cpustr := strings.TrimLeft(ls[0], "cpu")
|
||||
cpu, _ := strconv.Atoi(cpustr)
|
||||
ParseStatLine(line, m.cpus[cpu])
|
||||
ParseStatLine(line, cpu, out)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m *CpustatCollector) Close() {
|
||||
m.init = false
|
||||
return
|
||||
}
|
||||
|
@@ -2,6 +2,7 @@ package collectors
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
lp "github.com/influxdata/line-protocol"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"os/exec"
|
||||
@@ -14,16 +15,21 @@ const LIDFILE = `/sys/class/infiniband/mlx4_0/ports/1/lid`
|
||||
|
||||
type InfinibandCollector struct {
|
||||
MetricCollector
|
||||
tags map[string]string
|
||||
}
|
||||
|
||||
func (m *InfinibandCollector) Init() error {
|
||||
m.name = "InfinibandCollector"
|
||||
m.setup()
|
||||
m.tags = map[string]string{"type": "node"}
|
||||
_, err := ioutil.ReadFile(string(LIDFILE))
|
||||
if err == nil {
|
||||
m.init = true
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
func (m *InfinibandCollector) Read(interval time.Duration) {
|
||||
func (m *InfinibandCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
||||
buffer, err := ioutil.ReadFile(string(LIDFILE))
|
||||
|
||||
if err != nil {
|
||||
@@ -48,19 +54,26 @@ func (m *InfinibandCollector) Read(interval time.Duration) {
|
||||
lv := strings.Fields(line)
|
||||
v, err := strconv.ParseFloat(lv[1], 64)
|
||||
if err == nil {
|
||||
m.node["ib_recv"] = float64(v)
|
||||
y, err := lp.New("ib_recv", m.tags, map[string]interface{}{"value": float64(v)}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
}
|
||||
if strings.HasPrefix(line, "PortXmitData") || strings.HasPrefix(line, "XmtData") {
|
||||
lv := strings.Fields(line)
|
||||
v, err := strconv.ParseFloat(lv[1], 64)
|
||||
if err == nil {
|
||||
m.node["ib_xmit"] = float64(v)
|
||||
y, err := lp.New("ib_xmit", m.tags, map[string]interface{}{"value": float64(v)}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m *InfinibandCollector) Close() {
|
||||
m.init = false
|
||||
return
|
||||
}
|
||||
|
@@ -11,6 +11,7 @@ import "C"
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
lp "github.com/influxdata/line-protocol"
|
||||
"log"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -23,7 +24,6 @@ type LikwidCollector struct {
|
||||
sock2tid map[int]int
|
||||
metrics map[C.int]map[string]int
|
||||
groups map[string]C.int
|
||||
init bool
|
||||
}
|
||||
|
||||
type LikwidMetric struct {
|
||||
@@ -132,9 +132,13 @@ func (m *LikwidCollector) Init() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *LikwidCollector) Read(interval time.Duration) {
|
||||
func (m *LikwidCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
||||
if m.init {
|
||||
var ret C.int
|
||||
core_fp_any := make(map[int]float64, len(m.cpulist))
|
||||
for _, cpu := range m.cpulist {
|
||||
core_fp_any[int(cpu)] = 0.0
|
||||
}
|
||||
for gname, gid := range m.groups {
|
||||
ret = C.perfmon_setupCounters(gid)
|
||||
if ret != 0 {
|
||||
@@ -154,37 +158,68 @@ func (m *LikwidCollector) Read(interval time.Duration) {
|
||||
}
|
||||
|
||||
for _, lmetric := range likwid_metrics[gname] {
|
||||
if lmetric.name == "pwr1" || lmetric.name == "pwr2" {
|
||||
continue
|
||||
}
|
||||
if lmetric.socket_scope {
|
||||
for sid, tid := range m.sock2tid {
|
||||
res := C.perfmon_getLastMetric(gid, C.int(lmetric.group_idx), C.int(tid))
|
||||
m.sockets[int(sid)][lmetric.name] = float64(res)
|
||||
y, err := lp.New(lmetric.name,
|
||||
map[string]string{"type": "socket", "type-id": fmt.Sprintf("%d", int(sid))},
|
||||
map[string]interface{}{"value": float64(res)},
|
||||
time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
// log.Print("Metric '", lmetric.name,"' on Socket ",int(sid)," returns ", m.sockets[int(sid)][lmetric.name])
|
||||
}
|
||||
} else {
|
||||
for tid, cpu := range m.cpulist {
|
||||
res := C.perfmon_getLastMetric(gid, C.int(lmetric.group_idx), C.int(tid))
|
||||
m.cpus[int(cpu)][lmetric.name] = float64(res)
|
||||
y, err := lp.New(lmetric.name,
|
||||
map[string]string{"type": "cpu", "type-id": fmt.Sprintf("%d", int(cpu))},
|
||||
map[string]interface{}{"value": float64(res)},
|
||||
time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
if lmetric.name == "flops_dp" {
|
||||
core_fp_any[int(cpu)] += 2 * float64(res)
|
||||
}
|
||||
if lmetric.name == "flops_sp" {
|
||||
core_fp_any[int(cpu)] += float64(res)
|
||||
}
|
||||
// log.Print("Metric '", lmetric.name,"' on CPU ",int(cpu)," returns ", m.cpus[int(cpu)][lmetric.name])
|
||||
}
|
||||
}
|
||||
}
|
||||
for cpu := range m.cpus {
|
||||
if flops_dp, found := m.cpus[cpu]["flops_dp"]; found {
|
||||
if flops_sp, found := m.cpus[cpu]["flops_sp"]; found {
|
||||
m.cpus[cpu]["flops_any"] = (2 * flops_dp.(float64)) + flops_sp.(float64)
|
||||
for sid, tid := range m.sock2tid {
|
||||
sum := 0.0
|
||||
valid := false
|
||||
for _, lmetric := range likwid_metrics[gname] {
|
||||
if lmetric.name == "pwr1" || lmetric.name == "pwr2" {
|
||||
res := C.perfmon_getLastMetric(gid, C.int(lmetric.group_idx), C.int(tid))
|
||||
sum += float64(res)
|
||||
valid = true
|
||||
}
|
||||
}
|
||||
if valid {
|
||||
y, err := lp.New("power",
|
||||
map[string]string{"type": "socket", "type-id": fmt.Sprintf("%d", int(sid))},
|
||||
map[string]interface{}{"value": float64(sum)},
|
||||
time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
}
|
||||
for sid := range m.sockets {
|
||||
if pwr1, found := m.sockets[int(sid)]["pwr1"]; found {
|
||||
if pwr2, found := m.sockets[int(sid)]["pwr2"]; found {
|
||||
sum := pwr1.(float64) + pwr2.(float64)
|
||||
if sum > 0 {
|
||||
m.sockets[int(sid)]["power"] = sum
|
||||
}
|
||||
delete(m.sockets[int(sid)], "pwr2")
|
||||
}
|
||||
delete(m.sockets[int(sid)], "pwr1")
|
||||
for cpu := range m.cpulist {
|
||||
y, err := lp.New("flops_any",
|
||||
map[string]string{"type": "cpu", "type-id": fmt.Sprintf("%d", int(cpu))},
|
||||
map[string]interface{}{"value": float64(core_fp_any[int(cpu)])},
|
||||
time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -1,6 +1,7 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
lp "github.com/influxdata/line-protocol"
|
||||
"io/ioutil"
|
||||
"strconv"
|
||||
"strings"
|
||||
@@ -11,15 +12,23 @@ const LOADAVGFILE = `/proc/loadavg`
|
||||
|
||||
type LoadavgCollector struct {
|
||||
MetricCollector
|
||||
tags map[string]string
|
||||
load_matches []string
|
||||
proc_matches []string
|
||||
}
|
||||
|
||||
func (m *LoadavgCollector) Init() error {
|
||||
m.name = "LoadavgCollector"
|
||||
m.setup()
|
||||
m.tags = map[string]string{"type": "node"}
|
||||
m.load_matches = []string{"load_one", "load_five", "load_fifteen"}
|
||||
m.proc_matches = []string{"proc_run", "proc_total"}
|
||||
m.init = true
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *LoadavgCollector) Read(interval time.Duration) {
|
||||
func (m *LoadavgCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
||||
|
||||
buffer, err := ioutil.ReadFile(string(LOADAVGFILE))
|
||||
|
||||
if err != nil {
|
||||
@@ -27,19 +36,28 @@ func (m *LoadavgCollector) Read(interval time.Duration) {
|
||||
}
|
||||
|
||||
ls := strings.Split(string(buffer), ` `)
|
||||
loadOne, _ := strconv.ParseFloat(ls[0], 64)
|
||||
m.node["load_one"] = float64(loadOne)
|
||||
loadFive, _ := strconv.ParseFloat(ls[1], 64)
|
||||
m.node["load_five"] = float64(loadFive)
|
||||
loadFifteen, _ := strconv.ParseFloat(ls[2], 64)
|
||||
m.node["load_fifteen"] = float64(loadFifteen)
|
||||
for i, name := range m.load_matches {
|
||||
x, err := strconv.ParseFloat(ls[i], 64)
|
||||
if err == nil {
|
||||
y, err := lp.New(name, m.tags, map[string]interface{}{"value": float64(x)}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
}
|
||||
lv := strings.Split(ls[3], `/`)
|
||||
proc_run, _ := strconv.ParseFloat(lv[0], 64)
|
||||
proc_total, _ := strconv.ParseFloat(lv[1], 64)
|
||||
m.node["proc_total"] = float64(proc_total)
|
||||
m.node["proc_run"] = float64(proc_run)
|
||||
for i, name := range m.proc_matches {
|
||||
x, err := strconv.ParseFloat(lv[i], 64)
|
||||
if err == nil {
|
||||
y, err := lp.New(name, m.tags, map[string]interface{}{"value": float64(x)}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m *LoadavgCollector) Close() {
|
||||
m.init = false
|
||||
return
|
||||
}
|
||||
|
@@ -1,6 +1,7 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
lp "github.com/influxdata/line-protocol"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"strconv"
|
||||
@@ -12,16 +13,30 @@ const LUSTREFILE = `/proc/fs/lustre/llite/lnec-XXXXXX/stats`
|
||||
|
||||
type LustreCollector struct {
|
||||
MetricCollector
|
||||
tags map[string]string
|
||||
matches map[string]map[string]int
|
||||
}
|
||||
|
||||
func (m *LustreCollector) Init() error {
|
||||
m.name = "LustreCollector"
|
||||
m.setup()
|
||||
m.tags = map[string]string{"type": "node"}
|
||||
m.matches = map[string]map[string]int{"read_bytes": {"read_bytes": 6, "read_requests": 1},
|
||||
"write_bytes": {"write_bytes": 6, "write_requests": 1},
|
||||
"open": {"open": 1},
|
||||
"close": {"close": 1},
|
||||
"setattr": {"setattr": 1},
|
||||
"getattr": {"getattr": 1},
|
||||
"statfs": {"statfs": 1},
|
||||
"inode_permission": {"inode_permission": 1}}
|
||||
_, err := ioutil.ReadFile(string(LUSTREFILE))
|
||||
if err == nil {
|
||||
m.init = true
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
func (m *LustreCollector) Read(interval time.Duration) {
|
||||
func (m *LustreCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
||||
buffer, err := ioutil.ReadFile(string(LUSTREFILE))
|
||||
|
||||
if err != nil {
|
||||
@@ -32,31 +47,24 @@ func (m *LustreCollector) Read(interval time.Duration) {
|
||||
for _, line := range strings.Split(string(buffer), "\n") {
|
||||
lf := strings.Fields(line)
|
||||
if len(lf) > 1 {
|
||||
switch lf[0] {
|
||||
case "read_bytes":
|
||||
m.node["read_bytes"], err = strconv.ParseInt(lf[6], 0, 64)
|
||||
m.node["read_requests"], err = strconv.ParseInt(lf[1], 0, 64)
|
||||
case "write_bytes":
|
||||
m.node["write_bytes"], err = strconv.ParseInt(lf[6], 0, 64)
|
||||
m.node["write_requests"], err = strconv.ParseInt(lf[1], 0, 64)
|
||||
case "open":
|
||||
m.node["open"], err = strconv.ParseInt(lf[1], 0, 64)
|
||||
case "close":
|
||||
m.node["close"], err = strconv.ParseInt(lf[1], 0, 64)
|
||||
case "setattr":
|
||||
m.node["setattr"], err = strconv.ParseInt(lf[1], 0, 64)
|
||||
case "getattr":
|
||||
m.node["getattr"], err = strconv.ParseInt(lf[1], 0, 64)
|
||||
case "statfs":
|
||||
m.node["statfs"], err = strconv.ParseInt(lf[1], 0, 64)
|
||||
case "inode_permission":
|
||||
m.node["inode_permission"], err = strconv.ParseInt(lf[1], 0, 64)
|
||||
for match, fields := range m.matches {
|
||||
if lf[0] == match {
|
||||
for name, idx := range fields {
|
||||
x, err := strconv.ParseInt(lf[idx], 0, 64)
|
||||
if err == nil {
|
||||
y, err := lp.New(name, m.tags, map[string]interface{}{"value": x}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m *LustreCollector) Close() {
|
||||
m.init = false
|
||||
return
|
||||
}
|
||||
|
@@ -2,6 +2,8 @@ package collectors
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
lp "github.com/influxdata/line-protocol"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"strconv"
|
||||
@@ -13,15 +15,33 @@ const MEMSTATFILE = `/proc/meminfo`
|
||||
|
||||
type MemstatCollector struct {
|
||||
MetricCollector
|
||||
stats map[string]int64
|
||||
tags map[string]string
|
||||
matches map[string]string
|
||||
}
|
||||
|
||||
func (m *MemstatCollector) Init() error {
|
||||
m.name = "MemstatCollector"
|
||||
m.stats = make(map[string]int64)
|
||||
m.tags = map[string]string{"type": "node"}
|
||||
m.matches = map[string]string{`MemTotal`: "mem_total",
|
||||
"SwapTotal": "swap_total",
|
||||
"SReclaimable": "mem_sreclaimable",
|
||||
"Slab": "mem_slab",
|
||||
"MemFree": "mem_free",
|
||||
"Buffers": "mem_buffers",
|
||||
"Cached": "mem_cached",
|
||||
"MemAvailable": "mem_available",
|
||||
"SwapFree": "swap_free"}
|
||||
m.setup()
|
||||
_, err := ioutil.ReadFile(string(MEMSTATFILE))
|
||||
if err == nil {
|
||||
m.init = true
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *MemstatCollector) Read(interval time.Duration) {
|
||||
func (m *MemstatCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
||||
buffer, err := ioutil.ReadFile(string(MEMSTATFILE))
|
||||
|
||||
if err != nil {
|
||||
@@ -30,40 +50,53 @@ func (m *MemstatCollector) Read(interval time.Duration) {
|
||||
}
|
||||
|
||||
ll := strings.Split(string(buffer), "\n")
|
||||
memstats := make(map[string]int64)
|
||||
|
||||
for _, line := range ll {
|
||||
ls := strings.Split(line, `:`)
|
||||
if len(ls) > 1 {
|
||||
lv := strings.Fields(ls[1])
|
||||
memstats[ls[0]], err = strconv.ParseInt(lv[0], 0, 64)
|
||||
m.stats[ls[0]], err = strconv.ParseInt(lv[0], 0, 64)
|
||||
}
|
||||
}
|
||||
|
||||
if _, exists := memstats[`MemTotal`]; !exists {
|
||||
if _, exists := m.stats[`MemTotal`]; !exists {
|
||||
err = errors.New("Parse error")
|
||||
log.Print(err)
|
||||
return
|
||||
}
|
||||
|
||||
m.node["mem_total"] = float64(memstats[`MemTotal`]) * 1.0e-3
|
||||
m.node["swap_total"] = float64(memstats[`SwapTotal`]) * 1.0e-3
|
||||
m.node["mem_sreclaimable"] = float64(memstats[`SReclaimable`]) * 1.0e-3
|
||||
m.node["mem_slab"] = float64(memstats[`Slab`]) * 1.0e-3
|
||||
m.node["mem_free"] = float64(memstats[`MemFree`]) * 1.0e-3
|
||||
m.node["mem_buffers"] = float64(memstats[`Buffers`]) * 1.0e-3
|
||||
m.node["mem_cached"] = float64(memstats[`Cached`]) * 1.0e-3
|
||||
m.node["mem_available"] = float64(memstats[`MemAvailable`]) * 1.0e-3
|
||||
m.node["swap_free"] = float64(memstats[`SwapFree`]) * 1.0e-3
|
||||
for match, name := range m.matches {
|
||||
if _, exists := m.stats[match]; !exists {
|
||||
err = errors.New(fmt.Sprintf("Parse error for %s : %s", match, name))
|
||||
log.Print(err)
|
||||
continue
|
||||
}
|
||||
y, err := lp.New(name, m.tags, map[string]interface{}{"value": int(float64(m.stats[match]) * 1.0e-3)}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
memUsed := memstats[`MemTotal`] - (memstats[`MemFree`] + memstats[`Buffers`] + memstats[`Cached`])
|
||||
m.node["mem_used"] = float64(memUsed) * 1.0e-3
|
||||
// In linux-2.5.52 when Memshared was removed
|
||||
if _, found := memstats[`MemShared`]; found {
|
||||
m.node["mem_shared"] = float64(memstats[`MemShared`]) * 1.0e-3
|
||||
if _, free := m.stats[`MemFree`]; free {
|
||||
if _, buffers := m.stats[`Buffers`]; buffers {
|
||||
if _, cached := m.stats[`Cached`]; cached {
|
||||
memUsed := m.stats[`MemTotal`] - (m.stats[`MemFree`] + m.stats[`Buffers`] + m.stats[`Cached`])
|
||||
y, err := lp.New("mem_used", m.tags, map[string]interface{}{"value": int(float64(memUsed) * 1.0e-3)}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if _, found := m.stats[`MemShared`]; found {
|
||||
y, err := lp.New("mem_shared", m.tags, map[string]interface{}{"value": int(float64(m.stats[`MemShared`]) * 1.0e-3)}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m *MemstatCollector) Close() {
|
||||
m.init = false
|
||||
return
|
||||
}
|
||||
|
@@ -1,6 +1,7 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
lp "github.com/influxdata/line-protocol"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"strconv"
|
||||
@@ -11,48 +12,49 @@ import (
|
||||
type MetricGetter interface {
|
||||
Name() string
|
||||
Init() error
|
||||
Read(time.Duration)
|
||||
Read(time.Duration, *[]lp.MutableMetric)
|
||||
Close()
|
||||
GetNodeMetric() map[string]interface{}
|
||||
GetSocketMetrics() map[int]map[string]interface{}
|
||||
GetCpuMetrics() map[int]map[string]interface{}
|
||||
// GetNodeMetric() map[string]interface{}
|
||||
// GetSocketMetrics() map[int]map[string]interface{}
|
||||
// GetCpuMetrics() map[int]map[string]interface{}
|
||||
}
|
||||
|
||||
type MetricCollector struct {
|
||||
name string
|
||||
node map[string]interface{}
|
||||
sockets map[int]map[string]interface{}
|
||||
cpus map[int]map[string]interface{}
|
||||
init bool
|
||||
// node map[string]interface{}
|
||||
// sockets map[int]map[string]interface{}
|
||||
// cpus map[int]map[string]interface{}
|
||||
}
|
||||
|
||||
func (c *MetricCollector) Name() string {
|
||||
return c.name
|
||||
}
|
||||
|
||||
func (c *MetricCollector) GetNodeMetric() map[string]interface{} {
|
||||
return c.node
|
||||
}
|
||||
//func (c *MetricCollector) GetNodeMetric() map[string]interface{} {
|
||||
// return c.node
|
||||
//}
|
||||
|
||||
func (c *MetricCollector) GetSocketMetrics() map[int]map[string]interface{} {
|
||||
return c.sockets
|
||||
}
|
||||
//func (c *MetricCollector) GetSocketMetrics() map[int]map[string]interface{} {
|
||||
// return c.sockets
|
||||
//}
|
||||
|
||||
func (c *MetricCollector) GetCpuMetrics() map[int]map[string]interface{} {
|
||||
return c.cpus
|
||||
}
|
||||
//func (c *MetricCollector) GetCpuMetrics() map[int]map[string]interface{} {
|
||||
// return c.cpus
|
||||
//}
|
||||
|
||||
func (c *MetricCollector) setup() error {
|
||||
slist := SocketList()
|
||||
clist := CpuList()
|
||||
c.node = make(map[string]interface{})
|
||||
c.sockets = make(map[int]map[string]interface{}, len(slist))
|
||||
for _, s := range slist {
|
||||
c.sockets[s] = make(map[string]interface{})
|
||||
}
|
||||
c.cpus = make(map[int]map[string]interface{}, len(clist))
|
||||
for _, s := range clist {
|
||||
c.cpus[s] = make(map[string]interface{})
|
||||
}
|
||||
// slist := SocketList()
|
||||
// clist := CpuList()
|
||||
// c.node = make(map[string]interface{})
|
||||
// c.sockets = make(map[int]map[string]interface{}, len(slist))
|
||||
// for _, s := range slist {
|
||||
// c.sockets[s] = make(map[string]interface{})
|
||||
// }
|
||||
// c.cpus = make(map[int]map[string]interface{}, len(clist))
|
||||
// for _, s := range clist {
|
||||
// c.cpus[s] = make(map[string]interface{})
|
||||
// }
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@@ -1,7 +1,7 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
lp "github.com/influxdata/line-protocol"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"strconv"
|
||||
@@ -13,26 +13,33 @@ const NETSTATFILE = `/proc/net/dev`
|
||||
|
||||
type NetstatCollector struct {
|
||||
MetricCollector
|
||||
matches map[int]string
|
||||
tags map[string]string
|
||||
}
|
||||
|
||||
func (m *NetstatCollector) Init() error {
|
||||
m.name = "NetstatCollector"
|
||||
m.setup()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *NetstatCollector) Read(interval time.Duration) {
|
||||
data, err := ioutil.ReadFile(string(NETSTATFILE))
|
||||
if err != nil {
|
||||
log.Print(err.Error())
|
||||
return
|
||||
}
|
||||
var matches = map[int]string{
|
||||
m.tags = map[string]string{"type": "node"}
|
||||
m.matches = map[int]string{
|
||||
1: "bytes_in",
|
||||
9: "bytes_out",
|
||||
2: "pkts_in",
|
||||
10: "pkts_out",
|
||||
}
|
||||
_, err := ioutil.ReadFile(string(NETSTATFILE))
|
||||
if err == nil {
|
||||
m.init = true
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *NetstatCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
||||
data, err := ioutil.ReadFile(string(NETSTATFILE))
|
||||
if err != nil {
|
||||
log.Print(err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
lines := strings.Split(string(data), "\n")
|
||||
for _, l := range lines {
|
||||
@@ -44,10 +51,13 @@ func (m *NetstatCollector) Read(interval time.Duration) {
|
||||
if dev == "lo" {
|
||||
continue
|
||||
}
|
||||
for i, name := range matches {
|
||||
for i, name := range m.matches {
|
||||
v, err := strconv.ParseInt(f[i], 10, 0)
|
||||
if err == nil {
|
||||
m.node[fmt.Sprintf("%s_%s", dev, name)] = float64(v) * 1.0e-3
|
||||
y, err := lp.New(name, m.tags, map[string]interface{}{"value": int(float64(v) * 1.0e-3)}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -55,5 +65,6 @@ func (m *NetstatCollector) Read(interval time.Duration) {
|
||||
}
|
||||
|
||||
func (m *NetstatCollector) Close() {
|
||||
m.init = false
|
||||
return
|
||||
}
|
||||
|
@@ -4,6 +4,7 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||
lp "github.com/influxdata/line-protocol"
|
||||
"log"
|
||||
"time"
|
||||
)
|
||||
@@ -13,10 +14,21 @@ type NvidiaCollector struct {
|
||||
num_gpus int
|
||||
}
|
||||
|
||||
func (m *NvidiaCollector) CatchPanic() error {
|
||||
|
||||
if rerr := recover(); rerr != nil {
|
||||
log.Print("CatchPanic ", string(rerr.(string)))
|
||||
err := errors.New(rerr.(string))
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *NvidiaCollector) Init() error {
|
||||
m.name = "NvidiaCollector"
|
||||
m.setup()
|
||||
m.num_gpus = 0
|
||||
defer m.CatchPanic()
|
||||
ret := nvml.Init()
|
||||
if ret != nvml.SUCCESS {
|
||||
err := errors.New(nvml.ErrorString(ret))
|
||||
@@ -28,10 +40,11 @@ func (m *NvidiaCollector) Init() error {
|
||||
err := errors.New(nvml.ErrorString(ret))
|
||||
return err
|
||||
}
|
||||
m.init = true
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *NvidiaCollector) Read(interval time.Duration) {
|
||||
func (m *NvidiaCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
||||
|
||||
for i := 0; i < m.num_gpus; i++ {
|
||||
device, ret := nvml.DeviceGetHandleByIndex(i)
|
||||
@@ -39,113 +52,183 @@ func (m *NvidiaCollector) Read(interval time.Duration) {
|
||||
log.Fatalf("Unable to get device at index %d: %v", i, nvml.ErrorString(ret))
|
||||
return
|
||||
}
|
||||
base := fmt.Sprintf("gpu%d", i)
|
||||
tags := map[string]string{"type": "accelerator", "type-id": fmt.Sprintf("%d", i)}
|
||||
|
||||
util, ret := nvml.DeviceGetUtilizationRates(device)
|
||||
if ret == nvml.SUCCESS {
|
||||
m.node[fmt.Sprintf("%s_util", base)] = float64(util.Gpu)
|
||||
m.node[fmt.Sprintf("%s_mem_util", base)] = float64(util.Memory)
|
||||
y, err := lp.New("util", tags, map[string]interface{}{"value": float64(util.Gpu)}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
y, err = lp.New("mem_util", tags, map[string]interface{}{"value": float64(util.Memory)}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
meminfo, ret := nvml.DeviceGetMemoryInfo(device)
|
||||
if ret == nvml.SUCCESS {
|
||||
m.node[fmt.Sprintf("%s_mem_total", base)] = float64(meminfo.Total) / (1024 * 1024)
|
||||
m.node[fmt.Sprintf("%s_fb_memory", base)] = float64(meminfo.Used) / (1024 * 1024)
|
||||
t := float64(meminfo.Total) / (1024 * 1024)
|
||||
y, err := lp.New("mem_total", tags, map[string]interface{}{"value": t}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
f := float64(meminfo.Used) / (1024 * 1024)
|
||||
y, err = lp.New("fb_memory", tags, map[string]interface{}{"value": f}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
temp, ret := nvml.DeviceGetTemperature(device, nvml.TEMPERATURE_GPU)
|
||||
if ret == nvml.SUCCESS {
|
||||
m.node[fmt.Sprintf("%s_temp", base)] = float64(temp)
|
||||
y, err := lp.New("temp", tags, map[string]interface{}{"value": float64(temp)}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
fan, ret := nvml.DeviceGetFanSpeed(device)
|
||||
if ret == nvml.SUCCESS {
|
||||
m.node[fmt.Sprintf("%s_fan", base)] = float64(fan)
|
||||
y, err := lp.New("fan", tags, map[string]interface{}{"value": float64(fan)}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
_, ecc_pend, ret := nvml.DeviceGetEccMode(device)
|
||||
if ret == nvml.SUCCESS {
|
||||
var y lp.MutableMetric
|
||||
var err error
|
||||
switch ecc_pend {
|
||||
case nvml.FEATURE_DISABLED:
|
||||
m.node[fmt.Sprintf("%s_ecc_mode", base)] = string("OFF")
|
||||
y, err = lp.New("ecc_mode", tags, map[string]interface{}{"value": string("OFF")}, time.Now())
|
||||
case nvml.FEATURE_ENABLED:
|
||||
m.node[fmt.Sprintf("%s_ecc_mode", base)] = string("ON")
|
||||
y, err = lp.New("ecc_mode", tags, map[string]interface{}{"value": string("ON")}, time.Now())
|
||||
default:
|
||||
m.node[fmt.Sprintf("%s_ecc_mode", base)] = string("UNKNOWN")
|
||||
y, err = lp.New("ecc_mode", tags, map[string]interface{}{"value": string("UNKNOWN")}, time.Now())
|
||||
}
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
} else if ret == nvml.ERROR_NOT_SUPPORTED {
|
||||
m.node[fmt.Sprintf("%s_ecc_mode", base)] = string("N/A")
|
||||
y, err := lp.New("ecc_mode", tags, map[string]interface{}{"value": string("N/A")}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
pstate, ret := nvml.DeviceGetPerformanceState(device)
|
||||
if ret == nvml.SUCCESS {
|
||||
m.node[fmt.Sprintf("%s_perf_state", base)] = fmt.Sprintf("P%d", int(pstate))
|
||||
y, err := lp.New("perf_state", tags, map[string]interface{}{"value": fmt.Sprintf("P%d", int(pstate))}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
power, ret := nvml.DeviceGetPowerUsage(device)
|
||||
if ret == nvml.SUCCESS {
|
||||
m.node[fmt.Sprintf("%s_power_usage_report", base)] = float64(power) / 1000
|
||||
y, err := lp.New("power_usage_report", tags, map[string]interface{}{"value": float64(power) / 1000}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
gclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_GRAPHICS)
|
||||
if ret == nvml.SUCCESS {
|
||||
m.node[fmt.Sprintf("%s_graphics_clock_report", base)] = float64(gclk)
|
||||
y, err := lp.New("graphics_clock_report", tags, map[string]interface{}{"value": float64(gclk)}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM)
|
||||
if ret == nvml.SUCCESS {
|
||||
m.node[fmt.Sprintf("%s_sm_clock_report", base)] = float64(smclk)
|
||||
y, err := lp.New("sm_clock_report", tags, map[string]interface{}{"value": float64(smclk)}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM)
|
||||
if ret == nvml.SUCCESS {
|
||||
m.node[fmt.Sprintf("%s_mem_clock_report", base)] = float64(memclk)
|
||||
y, err := lp.New("mem_clock_report", tags, map[string]interface{}{"value": float64(memclk)}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
max_gclk, ret := nvml.DeviceGetMaxClockInfo(device, nvml.CLOCK_GRAPHICS)
|
||||
if ret == nvml.SUCCESS {
|
||||
m.node[fmt.Sprintf("%s_max_graphics_clock", base)] = float64(max_gclk)
|
||||
y, err := lp.New("max_graphics_clock", tags, map[string]interface{}{"value": float64(max_gclk)}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
max_smclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_SM)
|
||||
if ret == nvml.SUCCESS {
|
||||
m.node[fmt.Sprintf("%s_max_sm_clock", base)] = float64(max_smclk)
|
||||
y, err := lp.New("max_sm_clock", tags, map[string]interface{}{"value": float64(max_smclk)}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
max_memclk, ret := nvml.DeviceGetClockInfo(device, nvml.CLOCK_MEM)
|
||||
if ret == nvml.SUCCESS {
|
||||
m.node[fmt.Sprintf("%s_max_mem_clock", base)] = float64(max_memclk)
|
||||
y, err := lp.New("max_mem_clock", tags, map[string]interface{}{"value": float64(max_memclk)}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
ecc_db, ret := nvml.DeviceGetTotalEccErrors(device, 1, 1)
|
||||
if ret == nvml.SUCCESS {
|
||||
m.node[fmt.Sprintf("%s_ecc_db_error", base)] = float64(ecc_db)
|
||||
y, err := lp.New("ecc_db_error", tags, map[string]interface{}{"value": float64(ecc_db)}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
ecc_sb, ret := nvml.DeviceGetTotalEccErrors(device, 0, 1)
|
||||
if ret == nvml.SUCCESS {
|
||||
m.node[fmt.Sprintf("%s_ecc_sb_error", base)] = float64(ecc_sb)
|
||||
y, err := lp.New("ecc_sb_error", tags, map[string]interface{}{"value": float64(ecc_sb)}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
pwr_limit, ret := nvml.DeviceGetPowerManagementLimit(device)
|
||||
if ret == nvml.SUCCESS {
|
||||
m.node[fmt.Sprintf("%s_power_man_limit", base)] = float64(pwr_limit)
|
||||
y, err := lp.New("power_man_limit", tags, map[string]interface{}{"value": float64(pwr_limit)}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
enc_util, _, ret := nvml.DeviceGetEncoderUtilization(device)
|
||||
if ret == nvml.SUCCESS {
|
||||
m.node[fmt.Sprintf("%s_power_man_limit", base)] = float64(enc_util)
|
||||
y, err := lp.New("encoder_util", tags, map[string]interface{}{"value": float64(enc_util)}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
|
||||
dec_util, _, ret := nvml.DeviceGetDecoderUtilization(device)
|
||||
if ret == nvml.SUCCESS {
|
||||
m.node[fmt.Sprintf("%s_power_man_limit", base)] = float64(dec_util)
|
||||
y, err := lp.New("decoder_util", tags, map[string]interface{}{"value": float64(dec_util)}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func (m *NvidiaCollector) Close() {
|
||||
nvml.Shutdown()
|
||||
if m.init {
|
||||
nvml.Shutdown()
|
||||
m.init = false
|
||||
}
|
||||
return
|
||||
}
|
||||
|
@@ -2,6 +2,7 @@ package collectors
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
lp "github.com/influxdata/line-protocol"
|
||||
"log"
|
||||
"os/exec"
|
||||
"strings"
|
||||
@@ -12,15 +13,23 @@ const NUM_PROCS = 5
|
||||
|
||||
type TopProcsCollector struct {
|
||||
MetricCollector
|
||||
tags map[string]string
|
||||
}
|
||||
|
||||
func (m *TopProcsCollector) Init() error {
|
||||
m.name = "TopProcsCollector"
|
||||
m.tags = map[string]string{"type": "node"}
|
||||
m.setup()
|
||||
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
|
||||
command.Wait()
|
||||
_, err := command.Output()
|
||||
if err == nil {
|
||||
m.init = true
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *TopProcsCollector) Read(interval time.Duration) {
|
||||
func (m *TopProcsCollector) Read(interval time.Duration, out *[]lp.MutableMetric) {
|
||||
command := exec.Command("ps", "-Ao", "comm", "--sort=-pcpu")
|
||||
command.Wait()
|
||||
stdout, err := command.Output()
|
||||
@@ -31,10 +40,15 @@ func (m *TopProcsCollector) Read(interval time.Duration) {
|
||||
|
||||
lines := strings.Split(string(stdout), "\n")
|
||||
for i := 1; i < NUM_PROCS+1; i++ {
|
||||
m.node[fmt.Sprintf("topproc%d", i)] = lines[i]
|
||||
name := fmt.Sprintf("topproc%d", i)
|
||||
y, err := lp.New(name, m.tags, map[string]interface{}{"value": string(lines[i])}, time.Now())
|
||||
if err == nil {
|
||||
*out = append(*out, y)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m *TopProcsCollector) Close() {
|
||||
m.init = false
|
||||
return
|
||||
}
|
||||
|
Reference in New Issue
Block a user