mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-01-13 15:49:06 +01:00
7840de7b82
* Add cpu_used (all-cpu_idle) to CpustatCollector * Update cc-metric-collector.init * Allow selection of timestamp precision in HttpSink * Add comment about precision requirement for cc-metric-store * Fix for API changes in gofish@v0.15.0 * Update requirements to latest version * Read sensors through redfish * Update golang toolchain to 1.21 * Remove stray error check * Update main config in configuration.md * Update Release action to use golang 1.22 stable release, no golang RPMs anymore * Update runonce action to use golang 1.22 stable release, no golang RPMs anymore * Update README.md Use right JSON type in configuration * Update sink's README * Test whether ipmitool or ipmi-sensors can be executed without errors * Little fixes to the prometheus sink (#115) * Add uint64 to float64 cast option * Add prometheus sink to the list of available sinks * Add aggregated counters by gpu for nvlink errors --------- Co-authored-by: Michael Schwarz <schwarz@uni-paderborn.de> * Ccmessage migration (#119) * Add cpu_used (all-cpu_idle) to CpustatCollector * Update cc-metric-collector.init * Allow selection of timestamp precision in HttpSink * Add comment about precision requirement for cc-metric-store * Fix for API changes in gofish@v0.15.0 * Update requirements to latest version * Read sensors through redfish * Update golang toolchain to 1.21 * Remove stray error check * Update main config in configuration.md * Update Release action to use golang 1.22 stable release, no golang RPMs anymore * Update runonce action to use golang 1.22 stable release, no golang RPMs anymore * Switch to CCMessage for all files. 
--------- Co-authored-by: Holger Obermaier <Holger.Obermaier@kit.edu> Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> * Switch to ccmessage also for latest additions in nvidiaMetric * New Message processor (#118) * Add cpu_used (all-cpu_idle) to CpustatCollector * Update cc-metric-collector.init * Allow selection of timestamp precision in HttpSink * Add comment about precision requirement for cc-metric-store * Fix for API changes in gofish@v0.15.0 * Update requirements to latest version * Read sensors through redfish * Update golang toolchain to 1.21 * Remove stray error check * Update main config in configuration.md * Update Release action to use golang 1.22 stable release, no golang RPMs anymore * Update runonce action to use golang 1.22 stable release, no golang RPMs anymore * New message processor to check whether a message should be dropped or manipulate it in flight * Create a copy of message before manipulation --------- Co-authored-by: Holger Obermaier <Holger.Obermaier@kit.edu> Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> * Update collector's Makefile and go.mod/sum files * Use message processor in router, all sinks and all receivers * Add support for credential file (NKEY) to NATS sink and receiver * Fix JSON keys in message processor configuration * Update docs for message processor, router and the default router config file * Add link to expr syntax and fix regex matching docs * Update sample collectors * Minor style change in collector manager * Some helpers for ccTopology * LIKWID collector: write log owner change only once * Fix for metrics without units and reduce debugging messages for messageProcessor * Use shorted hostname for hostname added by router * Define default port for NATS * CPUstat collector: only add unit for applicable metrics * Add precision option to all sinks using Influx's encoder * Add message processor to all sink documentation * Add units to documentation of cpustat 
collector --------- Co-authored-by: Holger Obermaier <Holger.Obermaier@kit.edu> Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Co-authored-by: oscarminus <me@oscarminus.de> Co-authored-by: Michael Schwarz <schwarz@uni-paderborn.de>
464 lines
12 KiB
Go
464 lines
12 KiB
Go
package ccTopology
|
|
|
|
import (
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
"path/filepath"
|
|
"regexp"
|
|
"strconv"
|
|
"strings"
|
|
|
|
cclogger "github.com/ClusterCockpit/cc-metric-collector/pkg/ccLogger"
|
|
"golang.org/x/exp/slices"
|
|
)
|
|
|
|
// SYSFS_CPUBASE is the sysfs directory below which the per hardware thread
// directories (cpu0, cpu1, ...) are looked up.
const SYSFS_CPUBASE = "/sys/devices/system/cpu"
|
|
|
|
// HwthreadEntry collects the topology information of one hardware thread.
// The values originate from sysfs, see
// https://www.kernel.org/doc/Documentation/ABI/stable/sysfs-devices-system-cpu
type HwthreadEntry struct {
	CpuID        int   // ID of the CPU / hardware thread
	SMT          int   // Simultaneous multithreading ID of the hardware thread
	CoreCPUsList []int // Hardware threads located on the same core
	Core         int   // Core ID, local to the socket
	Socket       int   // Physical socket ID
	Die          int   // Die ID
	NumaDomain   int   // NUMA domain ID
}
|
|
|
|
var cache struct {
|
|
HwthreadList []int // List of CPU hardware threads
|
|
SMTList []int // List of symmetric hyper threading IDs
|
|
CoreList []int // List of CPU core IDs
|
|
SocketList []int // List of CPU sockets (physical) IDs
|
|
DieList []int // List of CPU Die IDs
|
|
NumaDomainList []int // List of NUMA Domains
|
|
|
|
CpuData []HwthreadEntry
|
|
}
|
|
|
|
// fileToInt reads an integer value from a sysfs file
|
|
// In case of an error -1 is returned
|
|
func fileToInt(path string) int {
|
|
buffer, err := os.ReadFile(path)
|
|
if err != nil {
|
|
log.Print(err)
|
|
cclogger.ComponentError("ccTopology", "fileToInt", "Reading", path, ":", err.Error())
|
|
return -1
|
|
}
|
|
stringBuffer := strings.TrimSpace(string(buffer))
|
|
id, err := strconv.Atoi(stringBuffer)
|
|
if err != nil {
|
|
cclogger.ComponentError("ccTopology", "fileToInt", "Parsing", path, ":", stringBuffer, err.Error())
|
|
return -1
|
|
}
|
|
return id
|
|
}
|
|
|
|
// fileToList reads a list from a sysfs file
|
|
// A list consists of value ranges separated by colon
|
|
// A range can be a single value or a range of values given by a startValue-endValue
|
|
// In case of an error nil is returned
|
|
func fileToList(path string) []int {
|
|
// Read list
|
|
buffer, err := os.ReadFile(path)
|
|
if err != nil {
|
|
log.Print(err)
|
|
cclogger.ComponentError("ccTopology", "fileToList", "Reading", path, ":", err.Error())
|
|
return nil
|
|
}
|
|
|
|
// Create list
|
|
list := make([]int, 0)
|
|
stringBuffer := strings.TrimSpace(string(buffer))
|
|
for _, valueRangeString := range strings.Split(stringBuffer, ",") {
|
|
valueRange := strings.Split(valueRangeString, "-")
|
|
switch len(valueRange) {
|
|
case 1:
|
|
singleValue, err := strconv.Atoi(valueRange[0])
|
|
if err != nil {
|
|
cclogger.ComponentError("CCTopology", "fileToList", "Parsing", valueRange[0], ":", err.Error())
|
|
return nil
|
|
}
|
|
list = append(list, singleValue)
|
|
case 2:
|
|
startValue, err := strconv.Atoi(valueRange[0])
|
|
if err != nil {
|
|
cclogger.ComponentError("CCTopology", "fileToList", "Parsing", valueRange[0], ":", err.Error())
|
|
return nil
|
|
}
|
|
endValue, err := strconv.Atoi(valueRange[1])
|
|
if err != nil {
|
|
cclogger.ComponentError("CCTopology", "fileToList", "Parsing", valueRange[1], ":", err.Error())
|
|
return nil
|
|
}
|
|
for value := startValue; value <= endValue; value++ {
|
|
list = append(list, value)
|
|
}
|
|
}
|
|
}
|
|
|
|
return list
|
|
}
|
|
|
|
// init initializes the cache structure
|
|
func init() {
|
|
|
|
getHWThreads :=
|
|
func() []int {
|
|
globPath := filepath.Join(SYSFS_CPUBASE, "cpu[0-9]*")
|
|
regexPath := filepath.Join(SYSFS_CPUBASE, "cpu([[:digit:]]+)")
|
|
regex := regexp.MustCompile(regexPath)
|
|
|
|
// File globbing for hardware threads
|
|
files, err := filepath.Glob(globPath)
|
|
if err != nil {
|
|
cclogger.ComponentError("CCTopology", "init:getHWThreads", err.Error())
|
|
return nil
|
|
}
|
|
|
|
hwThreadIDs := make([]int, len(files))
|
|
for i, file := range files {
|
|
// Extract hardware thread ID
|
|
matches := regex.FindStringSubmatch(file)
|
|
if len(matches) != 2 {
|
|
cclogger.ComponentError("CCTopology", "init:getHWThreads: Failed to extract hardware thread ID from ", file)
|
|
return nil
|
|
}
|
|
|
|
// Convert hardware thread ID to int
|
|
id, err := strconv.Atoi(matches[1])
|
|
if err != nil {
|
|
cclogger.ComponentError("CCTopology", "init:getHWThreads: Failed to convert to int hardware thread ID ", matches[1])
|
|
return nil
|
|
}
|
|
|
|
hwThreadIDs[i] = id
|
|
}
|
|
|
|
// Sort hardware thread IDs
|
|
slices.Sort(hwThreadIDs)
|
|
return hwThreadIDs
|
|
}
|
|
|
|
getNumaDomain :=
|
|
func(basePath string) int {
|
|
globPath := filepath.Join(basePath, "node*")
|
|
regexPath := filepath.Join(basePath, "node([[:digit:]]+)")
|
|
regex := regexp.MustCompile(regexPath)
|
|
|
|
// File globbing for NUMA node
|
|
files, err := filepath.Glob(globPath)
|
|
if err != nil {
|
|
cclogger.ComponentError("CCTopology", "init:getNumaDomain", err.Error())
|
|
return -1
|
|
}
|
|
|
|
// Check, that exactly one NUMA domain was found
|
|
if len(files) != 1 {
|
|
cclogger.ComponentError("CCTopology", "init:getNumaDomain", "Number of NUMA domains != 1: ", len(files))
|
|
return -1
|
|
}
|
|
|
|
// Extract NUMA node ID
|
|
matches := regex.FindStringSubmatch(files[0])
|
|
if len(matches) != 2 {
|
|
cclogger.ComponentError("CCTopology", "init:getNumaDomain", "Failed to extract NUMA node ID from: ", files[0])
|
|
return -1
|
|
}
|
|
|
|
id, err := strconv.Atoi(matches[1])
|
|
if err != nil {
|
|
cclogger.ComponentError("CCTopology", "init:getNumaDomain", "Failed to parse NUMA node ID from: ", matches[1])
|
|
return -1
|
|
}
|
|
|
|
return id
|
|
}
|
|
|
|
cache.HwthreadList = getHWThreads()
|
|
cache.CoreList = make([]int, len(cache.HwthreadList))
|
|
cache.SocketList = make([]int, len(cache.HwthreadList))
|
|
cache.DieList = make([]int, len(cache.HwthreadList))
|
|
cache.SMTList = make([]int, len(cache.HwthreadList))
|
|
cache.NumaDomainList = make([]int, len(cache.HwthreadList))
|
|
cache.CpuData = make([]HwthreadEntry, len(cache.HwthreadList))
|
|
for i, c := range cache.HwthreadList {
|
|
// Set cpuBase directory for topology lookup
|
|
cpuBase := filepath.Join(SYSFS_CPUBASE, fmt.Sprintf("cpu%d", c))
|
|
topoBase := filepath.Join(cpuBase, "topology")
|
|
|
|
// Lookup Core ID
|
|
cache.CoreList[i] = fileToInt(filepath.Join(topoBase, "core_id"))
|
|
|
|
// Lookup socket / physical package ID
|
|
cache.SocketList[i] = fileToInt(filepath.Join(topoBase, "physical_package_id"))
|
|
|
|
// Lookup CPU die id
|
|
cache.DieList[i] = fileToInt(filepath.Join(topoBase, "die_id"))
|
|
if cache.DieList[i] < 0 {
|
|
cache.DieList[i] = cache.SocketList[i]
|
|
}
|
|
|
|
// Lookup List of CPUs within the same core
|
|
coreCPUsList := fileToList(filepath.Join(topoBase, "core_cpus_list"))
|
|
|
|
// Find index of CPU ID in List of CPUs within the same core
|
|
// if not found return -1
|
|
cache.SMTList[i] = slices.Index(coreCPUsList, c)
|
|
|
|
// Lookup NUMA domain id
|
|
cache.NumaDomainList[i] = getNumaDomain(cpuBase)
|
|
|
|
cache.CpuData[i] =
|
|
HwthreadEntry{
|
|
CpuID: cache.HwthreadList[i],
|
|
SMT: cache.SMTList[i],
|
|
CoreCPUsList: coreCPUsList,
|
|
Socket: cache.SocketList[i],
|
|
NumaDomain: cache.NumaDomainList[i],
|
|
Die: cache.DieList[i],
|
|
Core: cache.CoreList[i],
|
|
}
|
|
}
|
|
|
|
slices.Sort(cache.HwthreadList)
|
|
cache.HwthreadList = slices.Compact(cache.HwthreadList)
|
|
|
|
slices.Sort(cache.SMTList)
|
|
cache.SMTList = slices.Compact(cache.SMTList)
|
|
|
|
slices.Sort(cache.CoreList)
|
|
cache.CoreList = slices.Compact(cache.CoreList)
|
|
|
|
slices.Sort(cache.SocketList)
|
|
cache.SocketList = slices.Compact(cache.SocketList)
|
|
|
|
slices.Sort(cache.DieList)
|
|
cache.DieList = slices.Compact(cache.DieList)
|
|
|
|
slices.Sort(cache.NumaDomainList)
|
|
cache.NumaDomainList = slices.Compact(cache.NumaDomainList)
|
|
}
|
|
|
|
// SocketList gets the list of CPU socket IDs
|
|
func SocketList() []int {
|
|
return slices.Clone(cache.SocketList)
|
|
}
|
|
|
|
// HwthreadList gets the list of hardware thread IDs in the order of listing in /proc/cpuinfo
|
|
func HwthreadList() []int {
|
|
return slices.Clone(cache.HwthreadList)
|
|
}
|
|
|
|
// Get list of hardware thread IDs in the order of listing in /proc/cpuinfo
|
|
// Deprecated! Use HwthreadList()
|
|
func CpuList() []int {
|
|
return HwthreadList()
|
|
}
|
|
|
|
// CoreList gets the list of CPU core IDs in the order of listing in /proc/cpuinfo
|
|
func CoreList() []int {
|
|
return slices.Clone(cache.CoreList)
|
|
}
|
|
|
|
// Get list of NUMA node IDs
|
|
func NumaNodeList() []int {
|
|
return slices.Clone(cache.NumaDomainList)
|
|
}
|
|
|
|
// DieList gets the list of CPU die IDs
|
|
func DieList() []int {
|
|
if len(cache.DieList) > 0 {
|
|
return slices.Clone(cache.DieList)
|
|
}
|
|
return SocketList()
|
|
}
|
|
|
|
// GetTypeList gets the list of specified type using the naming format inside ClusterCockpit
|
|
func GetTypeList(topology_type string) []int {
|
|
switch topology_type {
|
|
case "node":
|
|
return []int{0}
|
|
case "socket":
|
|
return SocketList()
|
|
case "die":
|
|
return DieList()
|
|
case "memoryDomain":
|
|
return NumaNodeList()
|
|
case "core":
|
|
return CoreList()
|
|
case "hwthread":
|
|
return HwthreadList()
|
|
}
|
|
return []int{}
|
|
}
|
|
|
|
func GetTypeId(hwt HwthreadEntry, topology_type string) (int, error) {
|
|
var err error = nil
|
|
switch topology_type {
|
|
case "node":
|
|
return 0, err
|
|
case "socket":
|
|
return hwt.Socket, err
|
|
case "die":
|
|
return hwt.Die, err
|
|
case "memoryDomain":
|
|
return hwt.NumaDomain, err
|
|
case "core":
|
|
return hwt.Core, err
|
|
case "hwthread":
|
|
return hwt.CpuID, err
|
|
}
|
|
return -1, fmt.Errorf("unknown topology type '%s'", topology_type)
|
|
}
|
|
|
|
// CpuData returns CPU data for each hardware thread
|
|
func CpuData() []HwthreadEntry {
|
|
// return a deep copy to protect cache data
|
|
c := slices.Clone(cache.CpuData)
|
|
for i := range c {
|
|
c[i].CoreCPUsList = slices.Clone(cache.CpuData[i].CoreCPUsList)
|
|
}
|
|
return c
|
|
}
|
|
|
|
// CpuInformation summarizes the CPU topology as a set of counters.
// It is filled by CpuInfo from the lengths of the cached ID lists.
type CpuInformation struct {
	NumHWthreads   int // Number of hardware thread IDs
	SMTWidth       int // Number of SMT IDs
	NumSockets     int // Number of socket IDs
	NumDies        int // Number of die IDs
	NumCores       int // Number of core IDs
	NumNumaDomains int // Number of NUMA domain IDs
}
|
|
|
|
// CpuInformation reports basic information about the CPU
|
|
func CpuInfo() CpuInformation {
|
|
return CpuInformation{
|
|
NumNumaDomains: len(cache.NumaDomainList),
|
|
SMTWidth: len(cache.SMTList),
|
|
NumDies: len(cache.DieList),
|
|
NumCores: len(cache.CoreList),
|
|
NumSockets: len(cache.SocketList),
|
|
NumHWthreads: len(cache.HwthreadList),
|
|
}
|
|
}
|
|
|
|
// GetHwthreadSocket gets the CPU socket ID for a given hardware thread ID
|
|
// In case hardware thread ID is not found -1 is returned
|
|
func GetHwthreadSocket(cpuID int) int {
|
|
for i := range cache.CpuData {
|
|
d := &cache.CpuData[i]
|
|
if d.CpuID == cpuID {
|
|
return d.Socket
|
|
}
|
|
}
|
|
return -1
|
|
}
|
|
|
|
// GetHwthreadNumaDomain gets the NUMA domain ID for a given hardware thread ID
|
|
// In case hardware thread ID is not found -1 is returned
|
|
func GetHwthreadNumaDomain(cpuID int) int {
|
|
for i := range cache.CpuData {
|
|
d := &cache.CpuData[i]
|
|
if d.CpuID == cpuID {
|
|
return d.NumaDomain
|
|
}
|
|
}
|
|
return -1
|
|
}
|
|
|
|
// GetHwthreadDie gets the CPU die ID for a given hardware thread ID
|
|
// In case hardware thread ID is not found -1 is returned
|
|
func GetHwthreadDie(cpuID int) int {
|
|
for i := range cache.CpuData {
|
|
d := &cache.CpuData[i]
|
|
if d.CpuID == cpuID {
|
|
return d.Die
|
|
}
|
|
}
|
|
return -1
|
|
}
|
|
|
|
// GetHwthreadCore gets the CPU core ID for a given hardware thread ID
|
|
// In case hardware thread ID is not found -1 is returned
|
|
func GetHwthreadCore(cpuID int) int {
|
|
for i := range cache.CpuData {
|
|
d := &cache.CpuData[i]
|
|
if d.CpuID == cpuID {
|
|
return d.Core
|
|
}
|
|
}
|
|
return -1
|
|
}
|
|
|
|
// GetSocketHwthreads gets all hardware thread IDs associated with a CPU socket
|
|
func GetSocketHwthreads(socket int) []int {
|
|
cpuList := make([]int, 0)
|
|
for i := range cache.CpuData {
|
|
d := &cache.CpuData[i]
|
|
if d.Socket == socket {
|
|
cpuList = append(cpuList, d.CpuID)
|
|
}
|
|
}
|
|
return cpuList
|
|
}
|
|
|
|
// GetNumaDomainHwthreads gets the all hardware thread IDs associated with a NUMA domain
|
|
func GetNumaDomainHwthreads(numaDomain int) []int {
|
|
cpuList := make([]int, 0)
|
|
for i := range cache.CpuData {
|
|
d := &cache.CpuData[i]
|
|
if d.NumaDomain == numaDomain {
|
|
cpuList = append(cpuList, d.CpuID)
|
|
}
|
|
}
|
|
return cpuList
|
|
}
|
|
|
|
// GetDieHwthreads gets all hardware thread IDs associated with a CPU die
|
|
func GetDieHwthreads(die int) []int {
|
|
cpuList := make([]int, 0)
|
|
for i := range cache.CpuData {
|
|
d := &cache.CpuData[i]
|
|
if d.Die == die {
|
|
cpuList = append(cpuList, d.CpuID)
|
|
}
|
|
}
|
|
return cpuList
|
|
}
|
|
|
|
// GetCoreHwthreads get all hardware thread IDs associated with a CPU core
|
|
func GetCoreHwthreads(core int) []int {
|
|
cpuList := make([]int, 0)
|
|
for i := range cache.CpuData {
|
|
d := &cache.CpuData[i]
|
|
if d.Core == core {
|
|
cpuList = append(cpuList, d.CpuID)
|
|
}
|
|
}
|
|
return cpuList
|
|
}
|
|
|
|
// GetTypeList gets the list of specified type using the naming format inside ClusterCockpit
|
|
func GetTypeHwthreads(topology_type string, id int) []int {
|
|
switch topology_type {
|
|
case "node":
|
|
return HwthreadList()
|
|
case "socket":
|
|
return GetSocketHwthreads(id)
|
|
case "die":
|
|
return GetDieHwthreads(id)
|
|
case "memoryDomain":
|
|
return GetNumaDomainHwthreads(id)
|
|
case "core":
|
|
return GetCoreHwthreads(id)
|
|
case "hwthread":
|
|
return []int{id}
|
|
}
|
|
return []int{}
|
|
}
|