From a2eba41150e9c7f6afb62adb3336bfd93816d00e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Jun 2026 03:13:37 +0000 Subject: [PATCH 1/2] Bump golang.design/x/thread Bumps [golang.design/x/thread](https://github.com/golang-design/thread) from 0.0.0-20210122121316-335e9adffdf1 to 0.3.2. - [Release notes](https://github.com/golang-design/thread/releases) - [Commits](https://github.com/golang-design/thread/commits/v0.3.2) --- updated-dependencies: - dependency-name: golang.design/x/thread dependency-version: 0.3.2 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/go.mod b/go.mod index 091096a..eb6fc6d 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,7 @@ require ( github.com/PaesslerAG/gval v1.2.4 github.com/fsnotify/fsnotify v1.10.1 github.com/tklauser/go-sysconf v0.4.0 - golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1 + golang.design/x/thread v0.3.2 golang.org/x/sys v0.45.0 ) diff --git a/go.sum b/go.sum index c2ad621..89754d1 100644 --- a/go.sum +++ b/go.sum @@ -173,8 +173,8 @@ go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.yaml.in/yaml/v2 v2.4.4 h1:tuyd0P+2Ont/d6e2rl3be67goVK4R6deVxCUX5vyPaQ= go.yaml.in/yaml/v2 v2.4.4/go.mod h1:gMZqIpDtDqOfM0uNfy0SkpRhvUryYH0Z6wdMYcacYXQ= -golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1 h1:P7S/GeHBAFEZIYp0ePPs2kHXoazz8q2KsyxHyQVGCJg= -golang.design/x/thread v0.0.0-20210122121316-335e9adffdf1/go.mod h1:9CWpnTUmlQkfdpdutA1nNf4iE5lAVt3QZOu0Z6hahBE= +golang.design/x/thread v0.3.2 h1:FmD1glspGrQCe6FuQLmSrT6wz2CSzq7vKVDluyiMnqo= +golang.design/x/thread v0.3.2/go.mod h1:6+Hi2rMOgMHZdKDWaqNHyWtoFUx1HxZ06LfHPh5Z/hQ= golang.org/x/crypto v0.50.0 h1:zO47/JPrL6vsNkINmLoo/PH1gcxpls50DNogFvB5ZGI= golang.org/x/crypto v0.50.0/go.mod h1:3muZ7vA7PBCE6xgPX7nkzzjiUq87kRItoJQM1Yo8S+Q= golang.org/x/exp v0.0.0-20231005195138-3e424a577f31 h1:9k5exFQKQglLo+RoP+4zMjOFE14P6+vyR0baDAi0Rcs= @@ -183,7 +183,6 @@ golang.org/x/mod v0.13.0 h1:I/DsJXRlw/8l/0c24sM9yb0T4z9liZTduXvdAWYiysY= golang.org/x/mod v0.13.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA= golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs= -golang.org/x/sys v0.0.0-20210122093101-04d7465088b8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.45.0 h1:dO4czNzziLiiXplLQgBCEpCvXQ3dnkn0SdaZSYdQ+FY= golang.org/x/sys v0.45.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U= From bed5491068d6e959bca0d904bcaf63b1f7d30808 Mon Sep 17 00:00:00 2001 From: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Date: Mon, 8 Jun 2026 14:00:09 +0200 Subject: [PATCH 2/2] Fix Overflows in Infiniband collector (#219) * Add information about the used infiniband counters * Change datatype from int64 to uint64 * uint64 subtraction handles wraparound automatically * Compute total rates by summing up the xmit and recv rates. This avoids overflows in the raw counters * Check for cases where the current counter can not be saved as last state * Use golang variable naming convention (camelCase) --- collectors/infinibandMetric.go | 156 ++++++++++++++++++++------------- 1 file changed, 93 insertions(+), 63 deletions(-) diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go index d68eea1..0d15311 100644 --- a/collectors/infinibandMetric.go +++ b/collectors/infinibandMetric.go @@ -23,20 +23,29 @@ import ( "golang.org/x/sys/unix" ) -const IB_BASEPATH = "/sys/class/infiniband/" +// See: https://www.kernel.org/doc/Documentation/ABI/stable/sysfs-class-infiniband +const ( + ibBasePath = "/sys/class/infiniband/" + ibDataUnit = "bytes" + ibDataRateUnit = ibDataUnit + "/sec" + ibPkgUnit = "packets" + ibPkgRateUnit = ibPkgUnit + "/sec" +) type InfinibandCollectorMetric struct { - name string - path string - unit string - scale int64 - addToIBTotal bool - addToIBTotalPkgs bool - lastState int64 + name string + path string + unit string + unitRates string + scaleByFourLanes bool + addToIBTotal bool + addToIBTotalPkgs bool + lastState uint64 + lastStateAvailable bool } type InfinibandCollectorInfo struct { - LID string // IB local Identifier (LID) + lid string // IB local Identifier (LID) device string // IB device port string // IB device port portCounterFiles []InfinibandCollectorMetric // mapping counter name -> InfinibandCollectorMetric @@ -56,7 +65,7 @@ type InfinibandCollector struct { lastTimestamp time.Time // Store time stamp of last tick to derive bandwidths } -// Init initializes the Infiniband collector by walking through files below IB_BASEPATH +// Init initializes the Infiniband collector by walking through files below ibBasePath func (m *InfinibandCollector) Init(config json.RawMessage) error { // Check if already initialized if m.init { @@ -87,7 +96,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error { } // Loop for all InfiniBand directories - globPattern := filepath.Join(IB_BASEPATH, "*", "ports", "*") + globPattern := filepath.Join(ibBasePath, "*", "ports", "*") ibDirs, err := filepath.Glob(globPattern) if err != nil { return fmt.Errorf("%s Init(): unable to glob files with pattern %s: %w", m.name, globPattern, err) @@ -122,36 +131,42 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error { countersDir := filepath.Join(path, "counters") portCounterFiles := []InfinibandCollectorMetric{ { - name: "ib_recv", - path: filepath.Join(countersDir, "port_rcv_data"), - unit: "bytes", - scale: 4, - addToIBTotal: true, - lastState: -1, + // Total number of data octets, divided by 4 (lanes), received on all VLs. + // This is 64 bit counter + name: "ib_recv", + path: filepath.Join(countersDir, "port_rcv_data"), + unit: ibDataUnit, + unitRates: ibDataRateUnit, + scaleByFourLanes: true, + addToIBTotal: true, }, { - name: "ib_xmit", - path: filepath.Join(countersDir, "port_xmit_data"), - unit: "bytes", - scale: 4, - addToIBTotal: true, - lastState: -1, + // Total number of data octets, divided by 4 (lanes), transmitted on all VLs. + // This is 64 bit counter + name: "ib_xmit", + path: filepath.Join(countersDir, "port_xmit_data"), + unit: ibDataUnit, + unitRates: ibDataRateUnit, + scaleByFourLanes: true, + addToIBTotal: true, }, { + // Total number of packets received on all VLs from this port (this may include packets containing Errors. + // This is 64 bit counter. name: "ib_recv_pkts", path: filepath.Join(countersDir, "port_rcv_packets"), - unit: "packets", - scale: 1, + unit: ibPkgUnit, + unitRates: ibPkgRateUnit, addToIBTotalPkgs: true, - lastState: -1, }, { + // Total number of packets transmitted on all VLs from this port. This may include packets with errors. + // This is 64 bit counter. name: "ib_xmit_pkts", path: filepath.Join(countersDir, "port_xmit_packets"), - unit: "packets", - scale: 1, + unit: ibPkgUnit, + unitRates: ibPkgRateUnit, addToIBTotalPkgs: true, - lastState: -1, }, } for _, counter := range portCounterFiles { @@ -163,7 +178,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error { m.info = append(m.info, InfinibandCollectorInfo{ - LID: LID, + lid: LID, device: device, port: port, portCounterFiles: portCounterFiles, @@ -184,7 +199,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error { return nil } -// Read reads Infiniband counter files below IB_BASEPATH +// Read reads Infiniband counter files below ibBasePath func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMessage) { // Check if already initialized if !m.init { @@ -201,9 +216,9 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess for i := range m.info { info := &m.info[i] - var ib_total, ib_total_last_state, - ib_total_pkts, ib_total_pkts_last_state int64 - var ib_total_last_state_available, ib_total_pkts_last_state_available bool + var ibTotal, ibTotalPkts uint64 // sum of xmit and recv counters + var ibTotalBw, ibTotalPktsBw float64 // sum of xmit and recv rates + var ibTotalBwAvailable, ibTotalPktsBwAvailable bool for i := range info.portCounterFiles { counterDef := &info.portCounterFiles[i] @@ -213,24 +228,30 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess cclog.ComponentError( m.name, fmt.Sprintf("Read(): Failed to read from file '%s': %v", counterDef.path, err)) + // Current counter can not be saved as last state + counterDef.lastStateAvailable = false continue } data := strings.TrimSpace(string(line)) - // convert counter to int64 - v, err := strconv.ParseInt(data, 10, 64) + // convert counter to uint64 + vRawCounter, err := strconv.ParseUint(data, 10, 64) if err != nil { cclog.ComponentError( m.name, - fmt.Sprintf("Read(): Failed to convert Infininiband metrice %s='%s' to int64: %v", counterDef.name, data, err)) + fmt.Sprintf("Read(): Failed to convert Infininiband metrice %s='%s' to uint64: %v", counterDef.name, data, err)) + // Current counter can not be saved as last state + counterDef.lastStateAvailable = false continue } - // Scale raw value - v *= counterDef.scale + vScaledCounter := vRawCounter + if counterDef.scaleByFourLanes { + vScaledCounter *= uint64(4) + } // Send absolut values if m.config.SendAbsoluteValues { - if y, err := lp.NewMetric(counterDef.name, info.tagSet, m.meta, v, now); err == nil { + if y, err := lp.NewMetric(counterDef.name, info.tagSet, m.meta, vScaledCounter, now); err == nil { y.AddMeta("unit", counterDef.unit) output <- y } @@ -238,63 +259,72 @@ func (m *InfinibandCollector) Read(interval time.Duration, output chan lp.CCMess // Send derived values if m.config.SendDerivedValues { - if counterDef.lastState >= 0 { - rate := float64((v - counterDef.lastState)) / timeDiff + if counterDef.lastStateAvailable { + var rate float64 + // uint64 subtraction handles wraparound automatically + // in case vRawCounter < counterDef.lastState we would compute: + // math.MaxUint64 - lastState + vRawCounter + 1 + // = (2^64 - 1) - lastState + vRawCounter + 1 + // = 2^64 - lastState + vRawCounter + // ≡ vRawCounter - lastState (mod 2^64) + rate = float64(vRawCounter-counterDef.lastState) / timeDiff + if counterDef.scaleByFourLanes { + rate *= float64(4) + } if y, err := lp.NewMetric(counterDef.name+"_bw", info.tagSet, m.meta, rate, now); err == nil { - y.AddMeta("unit", counterDef.unit+"/sec") + y.AddMeta("unit", counterDef.unitRates) output <- y } - // Sum up total values of last state + // Sum up rates for total rates if m.config.SendTotalValues { switch { case counterDef.addToIBTotal: - ib_total_last_state += counterDef.lastState - ib_total_last_state_available = true + ibTotalBw += rate + ibTotalBwAvailable = true case counterDef.addToIBTotalPkgs: - ib_total_pkts_last_state += counterDef.lastState - ib_total_pkts_last_state_available = true + ibTotalPktsBw += rate + ibTotalPktsBwAvailable = true } } } - counterDef.lastState = v + counterDef.lastState = vRawCounter + counterDef.lastStateAvailable = true } // Sum up total values if m.config.SendTotalValues { switch { case counterDef.addToIBTotal: - ib_total += v + ibTotal += vScaledCounter case counterDef.addToIBTotalPkgs: - ib_total_pkts += v + ibTotalPkts += vScaledCounter } } } // Send total values if m.config.SendTotalValues { - if y, err := lp.NewMetric("ib_total", info.tagSet, m.meta, ib_total, now); err == nil { - y.AddMeta("unit", "bytes") + if y, err := lp.NewMetric("ib_total", info.tagSet, m.meta, ibTotal, now); err == nil { + y.AddMeta("unit", ibDataUnit) output <- y } - if y, err := lp.NewMetric("ib_total_pkts", info.tagSet, m.meta, ib_total_pkts, now); err == nil { - y.AddMeta("unit", "packets") + if y, err := lp.NewMetric("ib_total_pkts", info.tagSet, m.meta, ibTotalPkts, now); err == nil { + y.AddMeta("unit", ibPkgUnit) output <- y } - if m.config.SendDerivedValues && ib_total_last_state_available { - rate := float64((ib_total - ib_total_last_state)) / timeDiff - if y, err := lp.NewMetric("ib_total_bw", info.tagSet, m.meta, rate, now); err == nil { - y.AddMeta("unit", "bytes/sec") + if m.config.SendDerivedValues && ibTotalBwAvailable { + if y, err := lp.NewMetric("ib_total_bw", info.tagSet, m.meta, ibTotalBw, now); err == nil { + y.AddMeta("unit", ibDataRateUnit) output <- y } } - if m.config.SendDerivedValues && ib_total_pkts_last_state_available { - rate := float64((ib_total_pkts - ib_total_pkts_last_state)) / timeDiff - if y, err := lp.NewMetric("ib_total_pkts_bw", info.tagSet, m.meta, rate, now); err == nil { - y.AddMeta("unit", "packets/sec") + if m.config.SendDerivedValues && ibTotalPktsBwAvailable { + if y, err := lp.NewMetric("ib_total_pkts_bw", info.tagSet, m.meta, ibTotalPktsBw, now); err == nil { + y.AddMeta("unit", ibPkgRateUnit) output <- y } }