mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2024-11-10 04:27:25 +01:00
8d85bd53f1
* Cleanup: Remove unused code * Use Golang duration parser for 'interval' and 'duration' in main config * Update handling of LIKWID headers. Download only if not already present in the system. Fixes #73 * Units with cc-units (#64) * Add option to normalize units with cc-unit * Add unit conversion to router * Add option to change unit prefix in the router * Add to MetricRouter README * Add order of operations in router to README * Use second add_tags/del_tags only if metric gets renamed * Skip disks in DiskstatCollector that have size=0 * Check readability of sensor files in TempCollector * Fix for --once option * Rename `cpu` type to `hwthread` (#69) * Rename 'cpu' type to 'hwthread' to avoid naming clashes with MetricStore and CC-Webfrontend * Collectors in parallel (#74) * Provide info to CollectorManager whether the collector can be executed in parallel with others * Split serial and parallel collectors. Read in parallel first * Update NvidiaCollector with new metrics, MIG and NvLink support (#75) * CC topology module update (#76) * Rename CPU to hardware thread, write some comments * Do renaming in other parts * Remove CpuList and SocketList function from metricCollector. 
Available in ccTopology * Option to use MIG UUID as subtype-id in NvidiaCollector * Option to use MIG slice name as subtype-id in NvidiaCollector * MetricRouter: Fix JSON in README * Fix for Github Action to really use the selected version * Remove Ganglia installation in runonce Action and add Go 1.18 * Fix daemon options in init script * Add separate go.mod files to use it with deprecated 1.16 * Minor updates for Makefiles * fix string comparison * AMD ROCm SMI collector (#77) * Add collector for AMD ROCm SMI metrics * Fix import path * Fix imports * Remove Board Number * store GPU index explicitly * Remove board number from description * Use http instead of ftp to download likwid * Fix serial number in rocmCollector * Improved http sink (#78) * automatic flush in NatsSink * tweak default options of HttpSink * shorter cirt. section and retries for HttpSink * fix error handling * Remove file added by mistake. * Use http instead of ftp to download likwid * Fix serial number in rocmCollector Co-authored-by: Thomas Roehl <thomas.roehl@fau.de> Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Co-authored-by: Lou <lou.knauer@gmx.de>
152 lines
3.5 KiB
Go
152 lines
3.5 KiB
Go
package collectors
|
|
|
|
import (
|
|
"bufio"
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
|
|
lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
|
|
)
|
|
|
|
// CPUSTATFILE is the path of the file the CpustatCollector opens in Init and
// Read to obtain the per-CPU counters.
const CPUSTATFILE = "/proc/stat"
|
|
|
|
// CpustatCollectorConfig holds the options of the CpustatCollector that can be
// set through its JSON configuration.
type CpustatCollectorConfig struct {
	// ExcludeMetrics lists metric names (for example "cpu_idle") that Init
	// leaves out of the set of metrics the collector reports.
	ExcludeMetrics []string `json:"exclude_metrics,omitempty"`
}
|
|
|
|
type CpustatCollector struct {
|
|
metricCollector
|
|
config CpustatCollectorConfig
|
|
matches map[string]int
|
|
cputags map[string]map[string]string
|
|
nodetags map[string]string
|
|
}
|
|
|
|
func (m *CpustatCollector) Init(config json.RawMessage) error {
|
|
m.name = "CpustatCollector"
|
|
m.setup()
|
|
m.parallel = true
|
|
m.meta = map[string]string{"source": m.name, "group": "CPU", "unit": "Percent"}
|
|
m.nodetags = map[string]string{"type": "node"}
|
|
if len(config) > 0 {
|
|
err := json.Unmarshal(config, &m.config)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
matches := map[string]int{
|
|
"cpu_user": 1,
|
|
"cpu_nice": 2,
|
|
"cpu_system": 3,
|
|
"cpu_idle": 4,
|
|
"cpu_iowait": 5,
|
|
"cpu_irq": 6,
|
|
"cpu_softirq": 7,
|
|
"cpu_steal": 8,
|
|
"cpu_guest": 9,
|
|
"cpu_guest_nice": 10,
|
|
}
|
|
|
|
m.matches = make(map[string]int)
|
|
for match, index := range matches {
|
|
doExclude := false
|
|
for _, exclude := range m.config.ExcludeMetrics {
|
|
if match == exclude {
|
|
doExclude = true
|
|
break
|
|
}
|
|
}
|
|
if !doExclude {
|
|
m.matches[match] = index
|
|
}
|
|
}
|
|
|
|
// Check input file
|
|
file, err := os.Open(string(CPUSTATFILE))
|
|
if err != nil {
|
|
cclog.ComponentError(m.name, err.Error())
|
|
}
|
|
defer file.Close()
|
|
|
|
// Pre-generate tags for all CPUs
|
|
num_cpus := 0
|
|
m.cputags = make(map[string]map[string]string)
|
|
scanner := bufio.NewScanner(file)
|
|
for scanner.Scan() {
|
|
line := scanner.Text()
|
|
linefields := strings.Fields(line)
|
|
if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
|
|
cpustr := strings.TrimLeft(linefields[0], "cpu")
|
|
cpu, _ := strconv.Atoi(cpustr)
|
|
m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)}
|
|
num_cpus++
|
|
}
|
|
}
|
|
m.init = true
|
|
return nil
|
|
}
|
|
|
|
func (m *CpustatCollector) parseStatLine(linefields []string, tags map[string]string, output chan lp.CCMetric) {
|
|
values := make(map[string]float64)
|
|
total := 0.0
|
|
for match, index := range m.matches {
|
|
if len(match) > 0 {
|
|
x, err := strconv.ParseInt(linefields[index], 0, 64)
|
|
if err == nil {
|
|
values[match] = float64(x)
|
|
total += values[match]
|
|
}
|
|
}
|
|
}
|
|
t := time.Now()
|
|
for name, value := range values {
|
|
y, err := lp.New(name, tags, m.meta, map[string]interface{}{"value": (value * 100.0) / total}, t)
|
|
if err == nil {
|
|
output <- y
|
|
}
|
|
}
|
|
}
|
|
|
|
func (m *CpustatCollector) Read(interval time.Duration, output chan lp.CCMetric) {
|
|
if !m.init {
|
|
return
|
|
}
|
|
num_cpus := 0
|
|
file, err := os.Open(string(CPUSTATFILE))
|
|
if err != nil {
|
|
cclog.ComponentError(m.name, err.Error())
|
|
}
|
|
defer file.Close()
|
|
|
|
scanner := bufio.NewScanner(file)
|
|
for scanner.Scan() {
|
|
line := scanner.Text()
|
|
linefields := strings.Fields(line)
|
|
if strings.Compare(linefields[0], "cpu") == 0 {
|
|
m.parseStatLine(linefields, m.nodetags, output)
|
|
} else if strings.HasPrefix(linefields[0], "cpu") {
|
|
m.parseStatLine(linefields, m.cputags[linefields[0]], output)
|
|
num_cpus++
|
|
}
|
|
}
|
|
|
|
num_cpus_metric, err := lp.New("num_cpus",
|
|
m.nodetags,
|
|
m.meta,
|
|
map[string]interface{}{"value": int(num_cpus)},
|
|
time.Now(),
|
|
)
|
|
if err == nil {
|
|
output <- num_cpus_metric
|
|
}
|
|
}
|
|
|
|
func (m *CpustatCollector) Close() {
|
|
m.init = false
|
|
}
|