mirror of
https://github.com/ClusterCockpit/cc-metric-collector.git
synced 2025-10-23 22:35:07 +02:00
Merge pull request #2 from ClusterCockpit/dev-tom
Initial working implementation with integrated likwid and central time loop
This commit is contained in:
320
clusterdaemon.go
320
clusterdaemon.go
@@ -1,320 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
|
||||
//"bytes"
|
||||
// "context"
|
||||
"encoding/json"
|
||||
"path/filepath"
|
||||
|
||||
//"sort"
|
||||
"errors"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
protocol "github.com/influxdata/line-protocol"
|
||||
)
|
||||
|
||||
type GlobalConfig struct {
|
||||
Sink struct {
|
||||
User string `json:"user"`
|
||||
Password string `json:"password"`
|
||||
} `json:"sink"`
|
||||
Host string `json:"host"`
|
||||
Port string `json:"port"`
|
||||
Report struct {
|
||||
Levels string `json:"levels"`
|
||||
Interval int `json:"interval"`
|
||||
} `json:"report"`
|
||||
Schedule struct {
|
||||
Core struct {
|
||||
Frequency int `json:"frequency"`
|
||||
Duration int `json:"duration"`
|
||||
} `json:"core"`
|
||||
Node struct {
|
||||
Frequency int `json:"frequency"`
|
||||
Duration int `json:"duration"`
|
||||
} `json:"node"`
|
||||
} `json:"schedule"`
|
||||
Metrics []string `json:"metrics"`
|
||||
CollectorPath string `json:"collector_path"`
|
||||
}
|
||||
|
||||
type CollectorConfig struct {
|
||||
Command string `json:"command"`
|
||||
Args string `json:"arguments"`
|
||||
Provides []string `json:"provides"`
|
||||
}
|
||||
|
||||
type InternalCollectorConfig struct {
|
||||
Config CollectorConfig
|
||||
Location string
|
||||
LastRun time.Time
|
||||
encoder *protocol.Encoder
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Load global configuration from JSON file
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
func LoadGlobalConfiguration(file string, config *GlobalConfig) error {
|
||||
configFile, err := os.Open(file)
|
||||
defer configFile.Close()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
jsonParser := json.NewDecoder(configFile)
|
||||
jsonParser.Decode(config)
|
||||
return err
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Load collector configuration from JSON file
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
func LoadCollectorConfiguration(file string, config *CollectorConfig) error {
|
||||
configFile, err := os.Open(file)
|
||||
defer configFile.Close()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
jsonParser := json.NewDecoder(configFile)
|
||||
jsonParser.Decode(config)
|
||||
return err
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Load collector configurations
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// GetSingleCollector returns a filepath.WalkFunc that appends the absolute
// path of every visited directory containing a "config.json" file to folders.
//
// The returned function reports an error only when it is called without file
// information (info == nil); in that case the error it was given is returned
// unchanged. Every other entry is handled best-effort and yields nil.
func GetSingleCollector(folders *[]string) filepath.WalkFunc {
	return func(path string, info os.FileInfo, err error) error {
		if info == nil {
			// filepath.Walk passes a nil info when it could not stat path
			// (for example a missing root directory). Calling info.IsDir()
			// in that situation used to panic; hand the cause back instead.
			return err
		}
		if !info.IsDir() {
			return nil
		}

		configfile := filepath.Join(path, "config.json")
		if _, statErr := os.Stat(configfile); statErr != nil {
			// No readable config.json here: not a collector folder.
			return nil
		}

		// TODO: Validate config?
		if p, absErr := filepath.Abs(path); absErr == nil {
			*folders = append(*folders, p)
		}
		return nil
	}
}
|
||||
|
||||
func GetCollectorFolders(root string, folders *[]string) error {
|
||||
err := filepath.Walk(root, GetSingleCollector(folders))
|
||||
if err != nil {
|
||||
err = errors.New("Cannot get collectors")
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Setup all collectors
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
func SetupCollectors(config GlobalConfig) ([]InternalCollectorConfig, error) {
|
||||
var folders []string
|
||||
var outconfig []InternalCollectorConfig
|
||||
//encoder := protocol.NewEncoder(buf)
|
||||
//encoder.SetMaxLineBytes(1024)
|
||||
GetCollectorFolders(config.CollectorPath, &folders)
|
||||
for _, path := range folders {
|
||||
var col_config InternalCollectorConfig
|
||||
LoadCollectorConfiguration(filepath.Join(path, "config.json"), &col_config.Config)
|
||||
col_config.LastRun = time.Now()
|
||||
col_config.Location = path
|
||||
//buf := &bytes.Buffer{}
|
||||
//col_config.Encoder := protocol.NewEncoder(buf)
|
||||
//col_config.Encoder.SetMaxLineBytes(1024)
|
||||
outconfig = append(outconfig, col_config)
|
||||
}
|
||||
return outconfig, nil
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Run collector
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
func RunCollector(config InternalCollectorConfig) ([]string, error) {
|
||||
var results []string
|
||||
var err error
|
||||
cmd := config.Config.Command
|
||||
|
||||
if _, err = os.Stat(cmd); err != nil {
|
||||
//fmt.Println(err.Error())
|
||||
if !strings.HasPrefix(cmd, "/") {
|
||||
cmd = filepath.Join(config.Location, config.Config.Command)
|
||||
if _, err = os.Stat(cmd); err != nil {
|
||||
//fmt.Println(err.Error())
|
||||
cmd, err = exec.LookPath(config.Config.Command)
|
||||
}
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
fmt.Println(err.Error())
|
||||
return results, err
|
||||
}
|
||||
|
||||
// TODO: Add timeout
|
||||
|
||||
command := exec.Command(cmd, config.Config.Args)
|
||||
command.Dir = config.Location
|
||||
command.Wait()
|
||||
stdout, err := command.Output()
|
||||
if err != nil {
|
||||
//log.error(err.Error())
|
||||
fmt.Println(err.Error())
|
||||
return results, err
|
||||
}
|
||||
|
||||
lines := strings.Split(string(stdout), "\n")
|
||||
|
||||
for _, l := range lines {
|
||||
if strings.HasPrefix(l, "#") {
|
||||
continue
|
||||
}
|
||||
results = append(results, l)
|
||||
}
|
||||
return results, err
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Setup sink
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
func SetupSink(config GlobalConfig) chan string {
|
||||
|
||||
c := make(chan string, 300)
|
||||
|
||||
// TODO: Setup something for sending? Establish HTTP connection?
|
||||
return c
|
||||
}
|
||||
|
||||
func RunSink(config GlobalConfig, queue *chan string) (*time.Ticker, chan bool) {
|
||||
|
||||
interval := time.Duration(config.Report.Interval) * time.Second
|
||||
ticker := time.NewTicker(interval)
|
||||
done := make(chan bool)
|
||||
|
||||
go func() {
|
||||
for {
|
||||
select {
|
||||
case <-done:
|
||||
return
|
||||
case t := <-ticker.C:
|
||||
fmt.Println("SinkTick at", t)
|
||||
empty := false
|
||||
var batch []string
|
||||
for empty == false {
|
||||
select {
|
||||
case metric := <-*queue:
|
||||
fmt.Println(metric)
|
||||
batch = append(batch, metric)
|
||||
default:
|
||||
// No metric available, wait for the next iteration
|
||||
empty = true
|
||||
break
|
||||
}
|
||||
}
|
||||
for _, m := range batch {
|
||||
fmt.Println(m)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
return ticker, done
|
||||
}
|
||||
|
||||
func CloseSink(config GlobalConfig, queue *chan string, ticker *time.Ticker, done chan bool) {
|
||||
ticker.Stop()
|
||||
done <- true
|
||||
close(*queue)
|
||||
}
|
||||
|
||||
func MainLoop(config GlobalConfig, sink *chan string) (*time.Ticker, chan bool) {
|
||||
var intConfig []InternalCollectorConfig
|
||||
intConfig, err := SetupCollectors(config)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
interval := time.Duration(config.Schedule.Node.Frequency) * time.Second
|
||||
ticker := time.NewTicker(time.Second)
|
||||
done := make(chan bool)
|
||||
|
||||
go func() {
|
||||
for {
|
||||
select {
|
||||
case <-done:
|
||||
return
|
||||
case t := <-ticker.C:
|
||||
fmt.Println("CollectorTick at", t)
|
||||
unix := time.Now()
|
||||
for i, _ := range intConfig {
|
||||
if time.Duration(unix.Sub(intConfig[i].LastRun)) > interval {
|
||||
res, err := RunCollector(intConfig[i])
|
||||
if err != nil {
|
||||
//log.error("Collector failed: ", err.Error())
|
||||
} else {
|
||||
//TODO: parse and skip in case of error, encode to []string
|
||||
for _, r := range res {
|
||||
if len(r) > 0 {
|
||||
*sink <- r
|
||||
}
|
||||
}
|
||||
}
|
||||
intConfig[i].LastRun = time.Now()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
return ticker, done
|
||||
}
|
||||
|
||||
func main() {
|
||||
// fmt.Println("Hello")
|
||||
// cmd_opts := []string{"la","le","lu"}
|
||||
// cmd := "echo"
|
||||
// s := run_cmd(cmd, cmd_opts)
|
||||
// fmt.Println(s)
|
||||
// tags := map[string]string {
|
||||
// "host" : "broadep2",
|
||||
// }
|
||||
// fields := map[string]interface{} {
|
||||
// "value" : float64(1.0),
|
||||
// }
|
||||
// fmt.Println(CreatePoint("flops_any", tags, fields, time.Now().UnixNano()))
|
||||
var config GlobalConfig
|
||||
LoadGlobalConfiguration("config.json", &config)
|
||||
|
||||
queue := SetupSink(config)
|
||||
sinkTicker, sinkDone := RunSink(config, &queue)
|
||||
collectTicker, collectDone := MainLoop(config, &queue)
|
||||
time.Sleep(1600 * time.Second)
|
||||
collectTicker.Stop()
|
||||
collectDone <- true
|
||||
CloseSink(config, &queue, sinkTicker, sinkDone)
|
||||
|
||||
// var folders []string
|
||||
// GetCollectorFolders(config.CollectorPath, &folders)
|
||||
|
||||
// for _, path := range folders {
|
||||
// var col_config CollectorConfig
|
||||
// LoadCollectorConfiguration(filepath.Join(path, "config.json"), &col_config)
|
||||
// stdout := run_cmd(filepath.Join(path, col_config.Command), col_config.Args)
|
||||
|
||||
// metrics := strings.Split(stdout, "\n")
|
||||
// for _, m := range metrics {
|
||||
// if len(m) > 0 {
|
||||
// t := strings.Fields(m)
|
||||
// if len(t) == 2 {
|
||||
// var s strings.Builder
|
||||
// fmt.Fprintf(&s, "%s %d", m, time.Now().UnixNano())
|
||||
// m = s.String()
|
||||
// }
|
||||
// fmt.Println("SEND", m)
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
}
|
@@ -1,190 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"os/signal"
|
||||
"strconv"
|
||||
"time"
|
||||
)
|
||||
|
||||
|
||||
// does not work
|
||||
//enum CollectScope {
|
||||
// Node: 0,
|
||||
// Socket,
|
||||
// Die,
|
||||
// LLC,
|
||||
// NUMA,
|
||||
// Core,
|
||||
// HWThread
|
||||
//}
|
||||
|
||||
//var scopeNames = map[CollectScope]string{
|
||||
// Node: "Node",
|
||||
// Socket: "Socket",
|
||||
// Die: "Die",
|
||||
// LLC: "LLC",
|
||||
// NUMA: "NUMA",
|
||||
// Core: "Core",
|
||||
// HWThread: "HWThread"
|
||||
//}
|
||||
|
||||
type CollectValue struct {
|
||||
Name string
|
||||
Value interface{}
|
||||
//scope CollectScope
|
||||
}
|
||||
|
||||
type InitFunc func() error
|
||||
type ReadFunc func(time.Duration) ([]CollectValue, error)
|
||||
type CloseFunc func() error
|
||||
type SinkFunc func([]CollectValue) error
|
||||
|
||||
func read_memavg(duration time.Duration) ([]CollectValue, error) {
|
||||
var values []CollectValue
|
||||
data, err := ioutil.ReadFile("/proc/meminfo")
|
||||
if err != nil {
|
||||
fmt.Println(err.Error())
|
||||
return values, err
|
||||
}
|
||||
var matches = map[string]string {
|
||||
"MemTotal" : "mem_total",
|
||||
"MemAvailable" : "mem_avail",
|
||||
"MemFree" : "mem_free",
|
||||
}
|
||||
lines := strings.Split(string(data), "\n")
|
||||
for _, l := range lines {
|
||||
for i,o := range matches {
|
||||
if strings.HasPrefix(l, i) {
|
||||
f := strings.Fields(l)
|
||||
v, err := strconv.ParseInt(f[1], 10, 0)
|
||||
if err == nil {
|
||||
var value CollectValue
|
||||
// value.Scope = Node
|
||||
value.Name = o
|
||||
value.Value = v
|
||||
values = append(values, value)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return values, nil
|
||||
}
|
||||
|
||||
func read_loadavg(duration time.Duration) ([]CollectValue, error) {
|
||||
var values []CollectValue
|
||||
data, err := ioutil.ReadFile("/proc/loadavg")
|
||||
if err != nil {
|
||||
fmt.Println(err.Error())
|
||||
return values, err
|
||||
}
|
||||
var matches = map[int]string {
|
||||
0 : "loadavg1m",
|
||||
1 : "loadavg5m",
|
||||
2 : "loadavg15m",
|
||||
}
|
||||
f := strings.Fields(string(data))
|
||||
for i, m := range matches {
|
||||
v, err := strconv.ParseFloat(f[i], 64)
|
||||
if err == nil {
|
||||
var value CollectValue
|
||||
value.Name = m
|
||||
value.Value = v
|
||||
// value.Scope = Node
|
||||
values = append(values, value)
|
||||
}
|
||||
}
|
||||
return values, nil
|
||||
}
|
||||
|
||||
func read_netstat(duration time.Duration) ([]CollectValue, error) {
|
||||
var values []CollectValue
|
||||
data, err := ioutil.ReadFile("/proc/net/dev")
|
||||
if err != nil {
|
||||
fmt.Println(err.Error())
|
||||
return values, err
|
||||
}
|
||||
var matches = map[int]string {
|
||||
1 : "bytes_in",
|
||||
9 : "bytes_out",
|
||||
2 : "pkts_in",
|
||||
10 : "pkts_out",
|
||||
}
|
||||
lines := strings.Split(string(data), "\n")
|
||||
for _, l := range lines {
|
||||
if ! strings.Contains(l, ":") {
|
||||
continue
|
||||
}
|
||||
f := strings.Fields(l)
|
||||
dev := f[0][0:len(f[0])-1]
|
||||
if dev == "lo" {
|
||||
continue
|
||||
}
|
||||
for i, m := range matches {
|
||||
v, err := strconv.ParseInt(f[i], 10, 0)
|
||||
if err == nil {
|
||||
var value CollectValue
|
||||
value.Name = fmt.Sprintf("%s_%s", dev, m)
|
||||
value.Value = v
|
||||
//value.Scope = Node
|
||||
values = append(values, value)
|
||||
}
|
||||
}
|
||||
}
|
||||
return values, nil
|
||||
}
|
||||
|
||||
func Send(values []CollectValue) error {
|
||||
for _, v := range values {
|
||||
fmt.Printf("Name: '%s' Value: '%v'\n", v.Name, v.Value)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func ReadAll(duration time.Duration, reads []ReadFunc, sink SinkFunc) {
|
||||
for _, f := range reads {
|
||||
values, err := f(duration)
|
||||
if err == nil {
|
||||
sink(values)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func ReadLoop(interval time.Duration, duration time.Duration, reads []ReadFunc, sink SinkFunc) {
|
||||
ticker := time.NewTicker(interval)
|
||||
done := make(chan bool)
|
||||
sigs := make(chan os.Signal, 1)
|
||||
signal.Notify(sigs, os.Interrupt)
|
||||
ReadAll(duration, reads, sink)
|
||||
go func() {
|
||||
<-sigs
|
||||
// Should call all CloseFunc functions here
|
||||
os.Exit(1)
|
||||
}()
|
||||
func() {
|
||||
select {
|
||||
case <-done:
|
||||
return
|
||||
case t := <-ticker.C:
|
||||
fmt.Println("Tick at", t)
|
||||
ReadAll(duration, reads, sink)
|
||||
}
|
||||
}()
|
||||
ticker.Stop()
|
||||
done <- true
|
||||
}
|
||||
|
||||
func main() {
|
||||
//var inits []InitFunc
|
||||
var reads = []ReadFunc {read_memavg, read_loadavg, read_netstat}
|
||||
//var closes []CloseFunc
|
||||
var duration time.Duration
|
||||
var interval time.Duration
|
||||
duration = time.Duration(1) * time.Second
|
||||
interval = time.Duration(10) * time.Second
|
||||
ReadLoop(interval, duration, reads, Send)
|
||||
return
|
||||
}
|
64
collectors/infinibandMetric.go
Normal file
64
collectors/infinibandMetric.go
Normal file
@@ -0,0 +1,64 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
const LIDFILE = `/sys/class/infiniband/mlx4_0/ports/1/lid`
|
||||
|
||||
type InfinibandCollector struct {
|
||||
MetricCollector
|
||||
}
|
||||
|
||||
func (m *InfinibandCollector) Init() {
|
||||
m.name = "InfinibandCollector"
|
||||
m.setup()
|
||||
}
|
||||
|
||||
func (m *InfinibandCollector) Read(interval time.Duration) {
|
||||
buffer, err := ioutil.ReadFile(string(LIDFILE))
|
||||
|
||||
if err != nil {
|
||||
log.Print(err)
|
||||
return
|
||||
}
|
||||
|
||||
args := fmt.Sprintf("-r %s 1 0xf000", string(buffer))
|
||||
|
||||
command := exec.Command("/usr/sbin/perfquery", args)
|
||||
command.Wait()
|
||||
stdout, err := command.Output()
|
||||
if err != nil {
|
||||
log.Print(err)
|
||||
return
|
||||
}
|
||||
|
||||
ll := strings.Split(string(stdout), "\n")
|
||||
|
||||
for _, line := range ll {
|
||||
if strings.HasPrefix(line, "PortRcvData") || strings.HasPrefix(line, "RcvData") {
|
||||
lv := strings.Fields(line)
|
||||
v, err := strconv.ParseFloat(lv[1], 64)
|
||||
if err == nil {
|
||||
m.node["ib_recv"] = float64(v)
|
||||
}
|
||||
}
|
||||
if strings.HasPrefix(line, "PortXmitData") || strings.HasPrefix(line, "XmtData") {
|
||||
lv := strings.Fields(line)
|
||||
v, err := strconv.ParseFloat(lv[1], 64)
|
||||
if err == nil {
|
||||
m.node["ib_xmit"] = float64(v)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m *InfinibandCollector) Close() {
|
||||
return
|
||||
}
|
@@ -1,64 +0,0 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
protocol "github.com/influxdata/line-protocol"
|
||||
)
|
||||
|
||||
type LikwidCollector struct {
|
||||
name string
|
||||
tags []*protocol.Tag
|
||||
fields []*protocol.Field
|
||||
t time.Time
|
||||
encoder *protocol.Encoder
|
||||
}
|
||||
|
||||
func (c *LikwidCollector) Name() string {
|
||||
return c.name
|
||||
}
|
||||
func (c *LikwidCollector) TagList() []*protocol.Tag {
|
||||
return c.tags
|
||||
}
|
||||
|
||||
func (c *LikwidCollector) FieldList() []*protocol.Field {
|
||||
return c.fields
|
||||
}
|
||||
|
||||
func (c *LikwidCollector) Time() time.Time {
|
||||
return c.t
|
||||
}
|
||||
|
||||
func (c *LikwidCollector) New() {
|
||||
buf := &bytes.Buffer{}
|
||||
c.encoder = protocol.NewEncoder(buf)
|
||||
c.encoder.SetMaxLineBytes(1024)
|
||||
}
|
||||
|
||||
func (c *LikwidCollector) Start(
|
||||
level string,
|
||||
frequency time.Duration,
|
||||
duration int) {
|
||||
ticker := time.NewTicker(frequency * time.Second)
|
||||
done := make(chan bool)
|
||||
|
||||
go func() {
|
||||
for {
|
||||
select {
|
||||
case <-done:
|
||||
return
|
||||
case t := <-ticker.C:
|
||||
fmt.Println("Tick at", t)
|
||||
|
||||
c.encoder.Encode(c)
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
time.Sleep(1600 * time.Second)
|
||||
ticker.Stop()
|
||||
done <- true
|
||||
fmt.Println("Ticker stopped")
|
||||
}
|
301
collectors/likwid/bstrlib.h
Normal file
301
collectors/likwid/bstrlib.h
Normal file
@@ -0,0 +1,301 @@
|
||||
/*
|
||||
* =======================================================================================
|
||||
* This source file is part of the bstring string library. This code was
|
||||
* written by Paul Hsieh in 2002-2008, and is covered by the BSD open source
|
||||
* license and the GPL. Refer to the accompanying documentation for details
|
||||
* on usage and license.
|
||||
*/
|
||||
/*
|
||||
* bstrlib.h
|
||||
*
|
||||
* This file is the header for the core module implementing the bstring functions.
|
||||
*/
|
||||
|
||||
#ifndef BSTRLIB_INCLUDE
|
||||
#define BSTRLIB_INCLUDE
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
#include <ctype.h>
|
||||
|
||||
#if !defined (BSTRLIB_VSNP_OK) && !defined (BSTRLIB_NOVSNP)
|
||||
# if defined (__TURBOC__) && !defined (__BORLANDC__)
|
||||
# define BSTRLIB_NOVSNP
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define BSTR_ERR (-1)
|
||||
#define BSTR_OK (0)
|
||||
#define BSTR_BS_BUFF_LENGTH_GET (0)
|
||||
|
||||
typedef struct tagbstring * bstring;
|
||||
typedef const struct tagbstring * const_bstring;
|
||||
|
||||
/* Copy functions */
|
||||
#define cstr2bstr bfromcstr
|
||||
extern bstring bfromcstr (const char * str);
|
||||
extern bstring bfromcstralloc (int mlen, const char * str);
|
||||
extern bstring blk2bstr (const void * blk, int len);
|
||||
extern char * bstr2cstr (const_bstring s, char z);
|
||||
extern int bcstrfree (char * s);
|
||||
extern bstring bstrcpy (const_bstring b1);
|
||||
extern int bassign (bstring a, const_bstring b);
|
||||
extern int bassignmidstr (bstring a, const_bstring b, int left, int len);
|
||||
extern int bassigncstr (bstring a, const char * str);
|
||||
extern int bassignblk (bstring a, const void * s, int len);
|
||||
|
||||
/* Destroy function */
|
||||
extern int bdestroy (bstring b);
|
||||
|
||||
/* Space allocation hinting functions */
|
||||
extern int balloc (bstring s, int len);
|
||||
extern int ballocmin (bstring b, int len);
|
||||
|
||||
/* Substring extraction */
|
||||
extern bstring bmidstr (const_bstring b, int left, int len);
|
||||
|
||||
/* Various standard manipulations */
|
||||
extern int bconcat (bstring b0, const_bstring b1);
|
||||
extern int bconchar (bstring b0, char c);
|
||||
extern int bcatcstr (bstring b, const char * s);
|
||||
extern int bcatblk (bstring b, const void * s, int len);
|
||||
extern int binsert (bstring s1, int pos, const_bstring s2, unsigned char fill);
|
||||
extern int binsertch (bstring s1, int pos, int len, unsigned char fill);
|
||||
extern int breplace (bstring b1, int pos, int len, const_bstring b2, unsigned char fill);
|
||||
extern int bdelete (bstring s1, int pos, int len);
|
||||
extern int bsetstr (bstring b0, int pos, const_bstring b1, unsigned char fill);
|
||||
extern int btrunc (bstring b, int n);
|
||||
|
||||
/* Scan/search functions */
|
||||
extern int bstricmp (const_bstring b0, const_bstring b1);
|
||||
extern int bstrnicmp (const_bstring b0, const_bstring b1, int n);
|
||||
extern int biseqcaseless (const_bstring b0, const_bstring b1);
|
||||
extern int bisstemeqcaselessblk (const_bstring b0, const void * blk, int len);
|
||||
extern int biseq (const_bstring b0, const_bstring b1);
|
||||
extern int bisstemeqblk (const_bstring b0, const void * blk, int len);
|
||||
extern int biseqcstr (const_bstring b, const char * s);
|
||||
extern int biseqcstrcaseless (const_bstring b, const char * s);
|
||||
extern int bstrcmp (const_bstring b0, const_bstring b1);
|
||||
extern int bstrncmp (const_bstring b0, const_bstring b1, int n);
|
||||
extern int binstr (const_bstring s1, int pos, const_bstring s2);
|
||||
extern int binstrr (const_bstring s1, int pos, const_bstring s2);
|
||||
extern int binstrcaseless (const_bstring s1, int pos, const_bstring s2);
|
||||
extern int binstrrcaseless (const_bstring s1, int pos, const_bstring s2);
|
||||
extern int bstrchrp (const_bstring b, int c, int pos);
|
||||
extern int bstrrchrp (const_bstring b, int c, int pos);
|
||||
#define bstrchr(b,c) bstrchrp ((b), (c), 0)
|
||||
#define bstrrchr(b,c) bstrrchrp ((b), (c), blength(b)-1)
|
||||
extern int binchr (const_bstring b0, int pos, const_bstring b1);
|
||||
extern int binchrr (const_bstring b0, int pos, const_bstring b1);
|
||||
extern int bninchr (const_bstring b0, int pos, const_bstring b1);
|
||||
extern int bninchrr (const_bstring b0, int pos, const_bstring b1);
|
||||
extern int bfindreplace (bstring b, const_bstring find, const_bstring repl, int pos);
|
||||
extern int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl, int pos);
|
||||
|
||||
/* List of string container functions */
|
||||
struct bstrList {
|
||||
int qty, mlen;
|
||||
bstring * entry;
|
||||
};
|
||||
extern struct bstrList * bstrListCreate (void);
|
||||
extern int bstrListDestroy (struct bstrList * sl);
|
||||
extern int bstrListAlloc (struct bstrList * sl, int msz);
|
||||
extern int bstrListAllocMin (struct bstrList * sl, int msz);
|
||||
|
||||
/* String split and join functions */
|
||||
extern struct bstrList * bsplit (const_bstring str, unsigned char splitChar);
|
||||
extern struct bstrList * bsplits (const_bstring str, const_bstring splitStr);
|
||||
extern struct bstrList * bsplitstr (const_bstring str, const_bstring splitStr);
|
||||
extern bstring bjoin (const struct bstrList * bl, const_bstring sep);
|
||||
extern int bsplitcb (const_bstring str, unsigned char splitChar, int pos,
|
||||
int (* cb) (void * parm, int ofs, int len), void * parm);
|
||||
extern int bsplitscb (const_bstring str, const_bstring splitStr, int pos,
|
||||
int (* cb) (void * parm, int ofs, int len), void * parm);
|
||||
extern int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos,
|
||||
int (* cb) (void * parm, int ofs, int len), void * parm);
|
||||
|
||||
/* Miscellaneous functions */
|
||||
extern int bpattern (bstring b, int len);
|
||||
extern int btoupper (bstring b);
|
||||
extern int btolower (bstring b);
|
||||
extern int bltrimws (bstring b);
|
||||
extern int brtrimws (bstring b);
|
||||
extern int btrimws (bstring b);
|
||||
|
||||
#if !defined (BSTRLIB_NOVSNP)
|
||||
extern bstring bformat (const char * fmt, ...);
|
||||
extern int bformata (bstring b, const char * fmt, ...);
|
||||
extern int bassignformat (bstring b, const char * fmt, ...);
|
||||
extern int bvcformata (bstring b, int count, const char * fmt, va_list arglist);
|
||||
|
||||
#define bvformata(ret, b, fmt, lastarg) { \
|
||||
bstring bstrtmp_b = (b); \
|
||||
const char * bstrtmp_fmt = (fmt); \
|
||||
int bstrtmp_r = BSTR_ERR, bstrtmp_sz = 16; \
|
||||
for (;;) { \
|
||||
va_list bstrtmp_arglist; \
|
||||
va_start (bstrtmp_arglist, lastarg); \
|
||||
bstrtmp_r = bvcformata (bstrtmp_b, bstrtmp_sz, bstrtmp_fmt, bstrtmp_arglist); \
|
||||
va_end (bstrtmp_arglist); \
|
||||
if (bstrtmp_r >= 0) { /* Everything went ok */ \
|
||||
bstrtmp_r = BSTR_OK; \
|
||||
break; \
|
||||
} else if (-bstrtmp_r <= bstrtmp_sz) { /* A real error? */ \
|
||||
bstrtmp_r = BSTR_ERR; \
|
||||
break; \
|
||||
} \
|
||||
bstrtmp_sz = -bstrtmp_r; /* Doubled or target size */ \
|
||||
} \
|
||||
ret = bstrtmp_r; \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
typedef int (*bNgetc) (void *parm);
|
||||
typedef size_t (* bNread) (void *buff, size_t elsize, size_t nelem, void *parm);
|
||||
|
||||
/* Input functions */
|
||||
extern bstring bgets (bNgetc getcPtr, void * parm, char terminator);
|
||||
extern bstring bread (bNread readPtr, void * parm);
|
||||
extern int bgetsa (bstring b, bNgetc getcPtr, void * parm, char terminator);
|
||||
extern int bassigngets (bstring b, bNgetc getcPtr, void * parm, char terminator);
|
||||
extern int breada (bstring b, bNread readPtr, void * parm);
|
||||
|
||||
/* Stream functions */
|
||||
extern struct bStream * bsopen (bNread readPtr, void * parm);
|
||||
extern void * bsclose (struct bStream * s);
|
||||
extern int bsbufflength (struct bStream * s, int sz);
|
||||
extern int bsreadln (bstring b, struct bStream * s, char terminator);
|
||||
extern int bsreadlns (bstring r, struct bStream * s, const_bstring term);
|
||||
extern int bsread (bstring b, struct bStream * s, int n);
|
||||
extern int bsreadlna (bstring b, struct bStream * s, char terminator);
|
||||
extern int bsreadlnsa (bstring r, struct bStream * s, const_bstring term);
|
||||
extern int bsreada (bstring b, struct bStream * s, int n);
|
||||
extern int bsunread (struct bStream * s, const_bstring b);
|
||||
extern int bspeek (bstring r, const struct bStream * s);
|
||||
extern int bssplitscb (struct bStream * s, const_bstring splitStr,
|
||||
int (* cb) (void * parm, int ofs, const_bstring entry), void * parm);
|
||||
extern int bssplitstrcb (struct bStream * s, const_bstring splitStr,
|
||||
int (* cb) (void * parm, int ofs, const_bstring entry), void * parm);
|
||||
extern int bseof (const struct bStream * s);
|
||||
|
||||
struct tagbstring {
|
||||
int mlen;
|
||||
int slen;
|
||||
unsigned char * data;
|
||||
};
|
||||
|
||||
/* Accessor macros */
|
||||
#define blengthe(b, e) (((b) == (void *)0 || (b)->slen < 0) ? (int)(e) : ((b)->slen))
|
||||
#define blength(b) (blengthe ((b), 0))
|
||||
#define bdataofse(b, o, e) (((b) == (void *)0 || (b)->data == (void*)0) ? (char *)(e) : ((char *)(b)->data) + (o))
|
||||
#define bdataofs(b, o) (bdataofse ((b), (o), (void *)0))
|
||||
#define bdatae(b, e) (bdataofse (b, 0, e))
|
||||
#define bdata(b) (bdataofs (b, 0))
|
||||
#define bchare(b, p, e) ((((unsigned)(p)) < (unsigned)blength(b)) ? ((b)->data[(p)]) : (e))
|
||||
#define bchar(b, p) bchare ((b), (p), '\0')
|
||||
|
||||
/* Static constant string initialization macro */
|
||||
#define bsStaticMlen(q,m) {(m), (int) sizeof(q)-1, (unsigned char *) ("" q "")}
|
||||
#if defined(_MSC_VER)
|
||||
# define bsStatic(q) bsStaticMlen(q,-32)
|
||||
#endif
|
||||
#ifndef bsStatic
|
||||
# define bsStatic(q) bsStaticMlen(q,-__LINE__)
|
||||
#endif
|
||||
|
||||
/* Static constant block parameter pair */
|
||||
#define bsStaticBlkParms(q) ((void *)("" q "")), ((int) sizeof(q)-1)
|
||||
|
||||
/* Reference building macros */
|
||||
#define cstr2tbstr btfromcstr
|
||||
#define btfromcstr(t,s) { \
|
||||
(t).data = (unsigned char *) (s); \
|
||||
(t).slen = ((t).data) ? ((int) (strlen) ((char *)(t).data)) : 0; \
|
||||
(t).mlen = -1; \
|
||||
}
|
||||
#define blk2tbstr(t,s,l) { \
|
||||
(t).data = (unsigned char *) (s); \
|
||||
(t).slen = l; \
|
||||
(t).mlen = -1; \
|
||||
}
|
||||
#define btfromblk(t,s,l) blk2tbstr(t,s,l)
|
||||
#define bmid2tbstr(t,b,p,l) { \
|
||||
const_bstring bstrtmp_s = (b); \
|
||||
if (bstrtmp_s && bstrtmp_s->data && bstrtmp_s->slen >= 0) { \
|
||||
int bstrtmp_left = (p); \
|
||||
int bstrtmp_len = (l); \
|
||||
if (bstrtmp_left < 0) { \
|
||||
bstrtmp_len += bstrtmp_left; \
|
||||
bstrtmp_left = 0; \
|
||||
} \
|
||||
if (bstrtmp_len > bstrtmp_s->slen - bstrtmp_left) \
|
||||
bstrtmp_len = bstrtmp_s->slen - bstrtmp_left; \
|
||||
if (bstrtmp_len <= 0) { \
|
||||
(t).data = (unsigned char *)""; \
|
||||
(t).slen = 0; \
|
||||
} else { \
|
||||
(t).data = bstrtmp_s->data + bstrtmp_left; \
|
||||
(t).slen = bstrtmp_len; \
|
||||
} \
|
||||
} else { \
|
||||
(t).data = (unsigned char *)""; \
|
||||
(t).slen = 0; \
|
||||
} \
|
||||
(t).mlen = -__LINE__; \
|
||||
}
|
||||
#define btfromblkltrimws(t,s,l) { \
|
||||
int bstrtmp_idx = 0, bstrtmp_len = (l); \
|
||||
unsigned char * bstrtmp_s = (s); \
|
||||
if (bstrtmp_s && bstrtmp_len >= 0) { \
|
||||
for (; bstrtmp_idx < bstrtmp_len; bstrtmp_idx++) { \
|
||||
if (!isspace (bstrtmp_s[bstrtmp_idx])) break; \
|
||||
} \
|
||||
} \
|
||||
(t).data = bstrtmp_s + bstrtmp_idx; \
|
||||
(t).slen = bstrtmp_len - bstrtmp_idx; \
|
||||
(t).mlen = -__LINE__; \
|
||||
}
|
||||
#define btfromblkrtrimws(t,s,l) { \
|
||||
int bstrtmp_len = (l) - 1; \
|
||||
unsigned char * bstrtmp_s = (s); \
|
||||
if (bstrtmp_s && bstrtmp_len >= 0) { \
|
||||
for (; bstrtmp_len >= 0; bstrtmp_len--) { \
|
||||
if (!isspace (bstrtmp_s[bstrtmp_len])) break; \
|
||||
} \
|
||||
} \
|
||||
(t).data = bstrtmp_s; \
|
||||
(t).slen = bstrtmp_len + 1; \
|
||||
(t).mlen = -__LINE__; \
|
||||
}
|
||||
#define btfromblktrimws(t,s,l) { \
|
||||
int bstrtmp_idx = 0, bstrtmp_len = (l) - 1; \
|
||||
unsigned char * bstrtmp_s = (s); \
|
||||
if (bstrtmp_s && bstrtmp_len >= 0) { \
|
||||
for (; bstrtmp_idx <= bstrtmp_len; bstrtmp_idx++) { \
|
||||
if (!isspace (bstrtmp_s[bstrtmp_idx])) break; \
|
||||
} \
|
||||
for (; bstrtmp_len >= bstrtmp_idx; bstrtmp_len--) { \
|
||||
if (!isspace (bstrtmp_s[bstrtmp_len])) break; \
|
||||
} \
|
||||
} \
|
||||
(t).data = bstrtmp_s + bstrtmp_idx; \
|
||||
(t).slen = bstrtmp_len + 1 - bstrtmp_idx; \
|
||||
(t).mlen = -__LINE__; \
|
||||
}
|
||||
|
||||
/* Write protection macros */
|
||||
#define bwriteprotect(t) { if ((t).mlen >= 0) (t).mlen = -1; }
|
||||
#define bwriteallow(t) { if ((t).mlen == -1) (t).mlen = (t).slen + ((t).slen == 0); }
|
||||
#define biswriteprotected(t) ((t).mlen <= 0)
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
31
collectors/likwid/groups/CLX/BRANCH.txt
Normal file
31
collectors/likwid/groups/CLX/BRANCH.txt
Normal file
@@ -0,0 +1,31 @@
|
||||
SHORT Branch prediction miss rate/ratio
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 BR_INST_RETIRED_ALL_BRANCHES
|
||||
PMC1 BR_MISP_RETIRED_ALL_BRANCHES
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Branch rate PMC0/FIXC0
|
||||
Branch misprediction rate PMC1/FIXC0
|
||||
Branch misprediction ratio PMC1/PMC0
|
||||
Instructions per branch FIXC0/PMC0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
|
||||
Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
|
||||
Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
|
||||
Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
|
||||
-
|
||||
The rates state how often on average a branch or a mispredicted branch occurred
|
||||
per instruction retired in total. The branch misprediction ratio sets directly
|
||||
into relation what ratio of all branch instructions were mispredicted.
|
||||
Instructions per branch is 1/branch rate.
|
||||
|
143
collectors/likwid/groups/CLX/CACHES.txt
Normal file
143
collectors/likwid/groups/CLX/CACHES.txt
Normal file
@@ -0,0 +1,143 @@
|
||||
SHORT Cache bandwidth in MBytes/s
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 L1D_REPLACEMENT
|
||||
PMC1 L1D_M_EVICT
|
||||
PMC2 L2_LINES_IN_ALL
|
||||
PMC3 L2_TRANS_L2_WB
|
||||
CBOX0C1 LLC_VICTIMS_M_STATE
|
||||
CBOX1C1 LLC_VICTIMS_M_STATE
|
||||
CBOX2C1 LLC_VICTIMS_M_STATE
|
||||
CBOX3C1 LLC_VICTIMS_M_STATE
|
||||
CBOX4C1 LLC_VICTIMS_M_STATE
|
||||
CBOX5C1 LLC_VICTIMS_M_STATE
|
||||
CBOX6C1 LLC_VICTIMS_M_STATE
|
||||
CBOX7C1 LLC_VICTIMS_M_STATE
|
||||
CBOX8C1 LLC_VICTIMS_M_STATE
|
||||
CBOX9C1 LLC_VICTIMS_M_STATE
|
||||
CBOX10C1 LLC_VICTIMS_M_STATE
|
||||
CBOX11C1 LLC_VICTIMS_M_STATE
|
||||
CBOX12C1 LLC_VICTIMS_M_STATE
|
||||
CBOX13C1 LLC_VICTIMS_M_STATE
|
||||
CBOX14C1 LLC_VICTIMS_M_STATE
|
||||
CBOX15C1 LLC_VICTIMS_M_STATE
|
||||
CBOX16C1 LLC_VICTIMS_M_STATE
|
||||
CBOX17C1 LLC_VICTIMS_M_STATE
|
||||
CBOX18C1 LLC_VICTIMS_M_STATE
|
||||
CBOX19C1 LLC_VICTIMS_M_STATE
|
||||
CBOX20C1 LLC_VICTIMS_M_STATE
|
||||
CBOX21C1 LLC_VICTIMS_M_STATE
|
||||
CBOX22C1 LLC_VICTIMS_M_STATE
|
||||
CBOX23C1 LLC_VICTIMS_M_STATE
|
||||
CBOX24C1 LLC_VICTIMS_M_STATE
|
||||
CBOX25C1 LLC_VICTIMS_M_STATE
|
||||
CBOX26C1 LLC_VICTIMS_M_STATE
|
||||
CBOX27C1 LLC_VICTIMS_M_STATE
|
||||
CBOX0C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX1C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX2C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX3C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX4C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX5C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX6C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX7C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX8C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX9C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX10C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX11C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX12C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX13C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX14C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX15C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX16C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX17C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX18C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX19C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX20C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX21C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX22C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX23C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX24C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX25C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX26C0 LLC_LOOKUP_DATA_READ
|
||||
CBOX27C0 LLC_LOOKUP_DATA_READ
|
||||
MBOX0C0 CAS_COUNT_RD
|
||||
MBOX0C1 CAS_COUNT_WR
|
||||
MBOX1C0 CAS_COUNT_RD
|
||||
MBOX1C1 CAS_COUNT_WR
|
||||
MBOX2C0 CAS_COUNT_RD
|
||||
MBOX2C1 CAS_COUNT_WR
|
||||
MBOX3C0 CAS_COUNT_RD
|
||||
MBOX3C1 CAS_COUNT_WR
|
||||
MBOX4C0 CAS_COUNT_RD
|
||||
MBOX4C1 CAS_COUNT_WR
|
||||
MBOX5C0 CAS_COUNT_RD
|
||||
MBOX5C1 CAS_COUNT_WR
|
||||
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
|
||||
L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0
|
||||
L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
|
||||
L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0
|
||||
L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
|
||||
L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
|
||||
L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time
|
||||
L3 to L2 load data volume [GBytes] 1.0E-09*PMC2*64.0
|
||||
L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time
|
||||
L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0
|
||||
L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
|
||||
L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
|
||||
System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0)*64.0/time
|
||||
System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0)*64.0
|
||||
L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64/time
|
||||
L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64
|
||||
L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64.0/time
|
||||
L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64.0
|
||||
Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time
|
||||
Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0
|
||||
Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time
|
||||
Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0
|
||||
Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time
|
||||
Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
|
||||
L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64
|
||||
L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
|
||||
L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64
|
||||
L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time
|
||||
L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64
|
||||
L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time
|
||||
L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64
|
||||
L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time
|
||||
L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64
|
||||
L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
|
||||
L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
|
||||
System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ))*64/time
|
||||
System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ))*64
|
||||
L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M_STATE))*64/time
|
||||
L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M_STATE))*64
|
||||
L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M_STATE))*64/time
|
||||
L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M_STATE))*64
|
||||
Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time
|
||||
Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
|
||||
Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time
|
||||
Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0
|
||||
Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time
|
||||
Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0
|
||||
-
|
||||
Group to measure cache transfers between L1 and Memory. Please notice that the
|
||||
L3 to/from system metrics contain any traffic to the system (memory,
|
||||
Intel QPI, etc.) but don't seem to handle anything because commonly memory read
|
||||
bandwidth and L3 to L2 bandwidth is higher than the memory to L3 bandwidth.
|
||||
|
26
collectors/likwid/groups/CLX/CLOCK.txt
Normal file
26
collectors/likwid/groups/CLX/CLOCK.txt
Normal file
@@ -0,0 +1,26 @@
|
||||
SHORT Power and Energy consumption
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PWR0 PWR_PKG_ENERGY
|
||||
UBOXFIX UNCORE_CLOCK
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
Uncore Clock [MHz] 1.E-06*UBOXFIX/time
|
||||
CPI FIXC1/FIXC0
|
||||
Energy [J] PWR0
|
||||
Power [W] PWR0/time
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Power = PWR_PKG_ENERGY / time
|
||||
Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time
|
||||
-
|
||||
Broadwell implements the new RAPL interface. This interface makes it possible to
|
||||
monitor the consumed energy on the package (socket) level.
|
||||
|
38
collectors/likwid/groups/CLX/CYCLE_ACTIVITY.txt
Normal file
38
collectors/likwid/groups/CLX/CYCLE_ACTIVITY.txt
Normal file
@@ -0,0 +1,38 @@
|
||||
SHORT Cycle Activities
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING
|
||||
PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING
|
||||
PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING
|
||||
PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Cycles without execution [%] (PMC3/FIXC1)*100
|
||||
Cycles without execution due to L1D [%] (PMC2/FIXC1)*100
|
||||
Cycles without execution due to L2 [%] (PMC0/FIXC1)*100
|
||||
Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100
|
||||
Cycles without execution due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100
|
||||
Cycles without execution due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100
|
||||
Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100
|
||||
--
|
||||
This performance group measures the cycles while waiting for data from the cache
|
||||
and memory hierarchy.
|
||||
CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts number of cycles nothing is executed on
|
||||
any execution port.
|
||||
CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is
|
||||
outstanding.
|
||||
CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is
|
||||
outstanding.
|
||||
CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while memory subsystem has an
|
||||
outstanding load.
|
45
collectors/likwid/groups/CLX/CYCLE_STALLS.txt
Normal file
45
collectors/likwid/groups/CLX/CYCLE_STALLS.txt
Normal file
@@ -0,0 +1,45 @@
|
||||
SHORT Cycle Activities (Stalls)
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING
|
||||
PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING
|
||||
PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING
|
||||
PMC3 CYCLE_ACTIVITY_STALLS_TOTAL
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Total execution stalls PMC3
|
||||
Stalls caused by L1D misses [%] (PMC2/PMC3)*100
|
||||
Stalls caused by L2 misses [%] (PMC0/PMC3)*100
|
||||
Stalls caused by memory loads [%] (PMC1/PMC3)*100
|
||||
Execution stall rate [%] (PMC3/FIXC1)*100
|
||||
Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100
|
||||
Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100
|
||||
Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL
|
||||
Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
|
||||
Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
|
||||
Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
|
||||
Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100
|
||||
Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100
|
||||
Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100
|
||||
Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100
|
||||
--
|
||||
This performance group measures the stalls caused by data traffic in the cache
|
||||
hierarchy.
|
||||
CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls.
|
||||
CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand
|
||||
load is outstanding.
|
||||
CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand
|
||||
load is outstanding.
|
||||
CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while memory subsystem has
|
||||
an outstanding load.
|
22
collectors/likwid/groups/CLX/DATA.txt
Normal file
22
collectors/likwid/groups/CLX/DATA.txt
Normal file
@@ -0,0 +1,22 @@
|
||||
SHORT Load to store ratio
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 MEM_INST_RETIRED_ALL_LOADS
|
||||
PMC1 MEM_INST_RETIRED_ALL_STORES
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Load to store ratio PMC0/PMC1
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES
|
||||
-
|
||||
This is a metric to determine your load to store ratio.
|
||||
|
24
collectors/likwid/groups/CLX/DIVIDE.txt
Normal file
24
collectors/likwid/groups/CLX/DIVIDE.txt
Normal file
@@ -0,0 +1,24 @@
|
||||
SHORT Divide unit information
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 ARITH_DIVIDER_COUNT
|
||||
PMC1 ARITH_DIVIDER_ACTIVE
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Number of divide ops PMC0
|
||||
Avg. divide unit usage duration PMC1/PMC0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Number of divide ops = ARITH_DIVIDER_COUNT
|
||||
Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT
|
||||
--
|
||||
This performance group measures the average latency of divide operations
|
35
collectors/likwid/groups/CLX/ENERGY.txt
Normal file
35
collectors/likwid/groups/CLX/ENERGY.txt
Normal file
@@ -0,0 +1,35 @@
|
||||
SHORT Power and Energy consumption
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
TMP0 TEMP_CORE
|
||||
PWR0 PWR_PKG_ENERGY
|
||||
PWR1 PWR_PP0_ENERGY
|
||||
PWR3 PWR_DRAM_ENERGY
|
||||
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Temperature [C] TMP0
|
||||
Energy [J] PWR0
|
||||
Power [W] PWR0/time
|
||||
Energy PP0 [J] PWR1
|
||||
Power PP0 [W] PWR1/time
|
||||
Energy DRAM [J] PWR3
|
||||
Power DRAM [W] PWR3/time
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Power = PWR_PKG_ENERGY / time
|
||||
Power PP0 = PWR_PP0_ENERGY / time
|
||||
Power DRAM = PWR_DRAM_ENERGY / time
|
||||
-
|
||||
Broadwell implements the new RAPL interface. This interface makes it possible to
|
||||
monitor the consumed energy on the package (socket) and DRAM level.
|
||||
|
25
collectors/likwid/groups/CLX/FLOPS_AVX.txt
Normal file
25
collectors/likwid/groups/CLX/FLOPS_AVX.txt
Normal file
@@ -0,0 +1,25 @@
|
||||
SHORT Packed AVX MFLOP/s
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
|
||||
PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
|
||||
PMC2 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE
|
||||
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0+PMC2*16.0)/time
|
||||
Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0+PMC3*8.0)/time
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
|
||||
Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
|
||||
-
|
||||
Packed AVX (256-bit and 512-bit) single and double precision FLOP rates.
|
34
collectors/likwid/groups/CLX/FLOPS_DP.txt
Normal file
34
collectors/likwid/groups/CLX/FLOPS_DP.txt
Normal file
@@ -0,0 +1,34 @@
|
||||
SHORT Double Precision MFLOP/s
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
|
||||
PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
|
||||
PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
|
||||
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time
|
||||
AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time
|
||||
AVX512 DP [MFLOP/s] 1.0E-06*(PMC3*8.0)/time
|
||||
Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time
|
||||
Scalar [MUOPS/s] 1.0E-06*PMC1/time
|
||||
Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3)
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
|
||||
AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
|
||||
AVX512 DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
|
||||
Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime
|
||||
Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
|
||||
Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)
|
||||
-
|
||||
SSE scalar and packed double precision FLOP rates.
|
||||
|
34
collectors/likwid/groups/CLX/FLOPS_SP.txt
Normal file
34
collectors/likwid/groups/CLX/FLOPS_SP.txt
Normal file
@@ -0,0 +1,34 @@
|
||||
SHORT Single Precision MFLOP/s
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
|
||||
PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE
|
||||
PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
|
||||
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time
|
||||
AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time
|
||||
AVX512 SP [MFLOP/s] 1.0E-06*(PMC3*16.0)/time
|
||||
Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time
|
||||
Scalar [MUOPS/s] 1.0E-06*PMC1/time
|
||||
Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3)
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
|
||||
AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
|
||||
AVX512 SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
|
||||
Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime
|
||||
Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
|
||||
Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)
|
||||
-
|
||||
SSE scalar and packed single precision FLOP rates.
|
||||
|
38
collectors/likwid/groups/CLX/L2.txt
Normal file
38
collectors/likwid/groups/CLX/L2.txt
Normal file
@@ -0,0 +1,38 @@
|
||||
SHORT L2 cache bandwidth in MBytes/s
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 L1D_REPLACEMENT
|
||||
PMC1 L1D_M_EVICT
|
||||
PMC2 ICACHE_64B_IFTAG_MISS
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
|
||||
L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
|
||||
L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
|
||||
L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
|
||||
L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
|
||||
L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
|
||||
L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
|
||||
L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
|
||||
L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
|
||||
L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_64B_IFTAG_MISS)*64/time
|
||||
L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_64B_IFTAG_MISS)*64
|
||||
-
|
||||
Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
|
||||
number of cache lines allocated in the L1 and the number of modified cache lines
|
||||
evicted from the L1. The group also outputs total data volume transferred between
|
||||
L2 and L1. Note that this bandwidth also includes data transfers due to a write
|
||||
allocate load on a store miss in L1 and traffic caused by misses in the
|
||||
L1 instruction cache.
|
||||
|
34
collectors/likwid/groups/CLX/L2CACHE.txt
Normal file
34
collectors/likwid/groups/CLX/L2CACHE.txt
Normal file
@@ -0,0 +1,34 @@
|
||||
SHORT L2 cache miss rate/ratio
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 L2_TRANS_ALL_REQUESTS
|
||||
PMC1 L2_RQSTS_MISS
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
L2 request rate PMC0/FIXC0
|
||||
L2 miss rate PMC1/FIXC0
|
||||
L2 miss ratio PMC1/PMC0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
|
||||
L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
|
||||
L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
|
||||
-
|
||||
This group measures the locality of your data accesses with regard to the
|
||||
L2 cache. L2 request rate tells you how data intensive your code is
|
||||
or how many data accesses you have on average per instruction.
|
||||
The L2 miss rate gives a measure how often it was necessary to get
|
||||
cache lines from memory. And finally L2 miss ratio tells you how many of your
|
||||
memory references required a cache line to be loaded from a higher level.
|
||||
While the data cache miss rate might be given by your algorithm you should
|
||||
try to get data cache miss ratio as low as possible by increasing your cache reuse.
|
||||
|
||||
|
36
collectors/likwid/groups/CLX/L3.txt
Normal file
36
collectors/likwid/groups/CLX/L3.txt
Normal file
@@ -0,0 +1,36 @@
|
||||
SHORT L3 cache bandwidth in MBytes/s
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 L2_LINES_IN_ALL
|
||||
PMC1 L2_TRANS_L2_WB
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
|
||||
L3 load data volume [GBytes] 1.0E-09*PMC0*64.0
|
||||
L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
|
||||
L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0
|
||||
L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
|
||||
L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
|
||||
L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
|
||||
L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time
|
||||
L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0
|
||||
L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
|
||||
L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
|
||||
-
|
||||
Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
|
||||
number of cache lines allocated in the L2 and the number of modified cache lines
|
||||
evicted from the L2. This group also outputs data volume transferred between the
|
||||
L3 and measured cores' L2 caches. Note that this bandwidth also includes data
|
||||
transfers due to a write allocate load on a store miss in L2.
|
||||
|
35
collectors/likwid/groups/CLX/L3CACHE.txt
Normal file
35
collectors/likwid/groups/CLX/L3CACHE.txt
Normal file
@@ -0,0 +1,35 @@
|
||||
SHORT L3 cache miss rate/ratio
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 MEM_LOAD_RETIRED_L3_HIT
|
||||
PMC1 MEM_LOAD_RETIRED_L3_MISS
|
||||
PMC2 UOPS_RETIRED_ALL
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
L3 request rate (PMC0+PMC1)/PMC2
|
||||
L3 miss rate PMC1/PMC2
|
||||
L3 miss ratio PMC1/(PMC0+PMC1)
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
L3 request rate = (MEM_LOAD_RETIRED_L3_HIT+MEM_LOAD_RETIRED_L3_MISS)/UOPS_RETIRED_ALL
|
||||
L3 miss rate = MEM_LOAD_RETIRED_L3_MISS/UOPS_RETIRED_ALL
|
||||
L3 miss ratio = MEM_LOAD_RETIRED_L3_MISS/(MEM_LOAD_RETIRED_L3_HIT+MEM_LOAD_RETIRED_L3_MISS)
|
||||
-
|
||||
This group measures the locality of your data accesses with regard to the
|
||||
L3 cache. L3 request rate tells you how data intensive your code is
|
||||
or how many data accesses you have on average per instruction.
|
||||
The L3 miss rate gives a measure how often it was necessary to get
|
||||
cache lines from memory. And finally L3 miss ratio tells you how many of your
|
||||
memory references required a cache line to be loaded from a higher level.
|
||||
While the data cache miss rate might be given by your algorithm you should
|
||||
try to get data cache miss ratio as low as possible by increasing your cache reuse.
|
||||
|
||||
|
48
collectors/likwid/groups/CLX/MEM.txt
Normal file
48
collectors/likwid/groups/CLX/MEM.txt
Normal file
@@ -0,0 +1,48 @@
|
||||
SHORT Main memory bandwidth in MBytes/s
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
MBOX0C0 CAS_COUNT_RD
|
||||
MBOX0C1 CAS_COUNT_WR
|
||||
MBOX1C0 CAS_COUNT_RD
|
||||
MBOX1C1 CAS_COUNT_WR
|
||||
MBOX2C0 CAS_COUNT_RD
|
||||
MBOX2C1 CAS_COUNT_WR
|
||||
MBOX3C0 CAS_COUNT_RD
|
||||
MBOX3C1 CAS_COUNT_WR
|
||||
MBOX4C0 CAS_COUNT_RD
|
||||
MBOX4C1 CAS_COUNT_WR
|
||||
MBOX5C0 CAS_COUNT_RD
|
||||
MBOX5C1 CAS_COUNT_WR
|
||||
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time
|
||||
Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0
|
||||
Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time
|
||||
Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0
|
||||
Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time
|
||||
Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime
|
||||
Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
|
||||
Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime
|
||||
Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
|
||||
Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime
|
||||
Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
|
||||
-
|
||||
Profiling group to measure memory bandwidth drawn by all cores of a socket.
|
||||
Since this group is based on Uncore events it is only possible to measure on a
|
||||
per socket base. Some of the counters may not be available on your system.
|
||||
Also outputs total data volume transferred from main memory.
|
||||
The same metrics are provided by the HA group.
|
||||
|
70
collectors/likwid/groups/CLX/MEM_DP.txt
Normal file
70
collectors/likwid/groups/CLX/MEM_DP.txt
Normal file
@@ -0,0 +1,70 @@
|
||||
SHORT Overview of arithmetic and main memory performance
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PWR0 PWR_PKG_ENERGY
|
||||
PWR3 PWR_DRAM_ENERGY
|
||||
PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
|
||||
PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
|
||||
PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
|
||||
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE
|
||||
MBOX0C0 CAS_COUNT_RD
|
||||
MBOX0C1 CAS_COUNT_WR
|
||||
MBOX1C0 CAS_COUNT_RD
|
||||
MBOX1C1 CAS_COUNT_WR
|
||||
MBOX2C0 CAS_COUNT_RD
|
||||
MBOX2C1 CAS_COUNT_WR
|
||||
MBOX3C0 CAS_COUNT_RD
|
||||
MBOX3C1 CAS_COUNT_WR
|
||||
MBOX4C0 CAS_COUNT_RD
|
||||
MBOX4C1 CAS_COUNT_WR
|
||||
MBOX5C0 CAS_COUNT_RD
|
||||
MBOX5C1 CAS_COUNT_WR
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Energy [J] PWR0
|
||||
Power [W] PWR0/time
|
||||
Energy DRAM [J] PWR3
|
||||
Power DRAM [W] PWR3/time
|
||||
DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time
|
||||
AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time
|
||||
Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time
|
||||
Scalar [MUOPS/s] 1.0E-06*PMC1/time
|
||||
Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time
|
||||
Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0
|
||||
Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time
|
||||
Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0
|
||||
Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time
|
||||
Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0
|
||||
Operational intensity (PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0)
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Power [W] = PWR_PKG_ENERGY/runtime
|
||||
Power DRAM [W] = PWR_DRAM_ENERGY/runtime
|
||||
DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
|
||||
AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
|
||||
Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime
|
||||
Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
|
||||
Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/runtime
|
||||
Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
|
||||
Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime
|
||||
Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0
|
||||
Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/runtime
|
||||
Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0
|
||||
Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/((SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0)
|
||||
--
|
||||
Profiling group to measure memory bandwidth drawn by all cores of a socket.
|
||||
Since this group is based on Uncore events it is only possible to measure on
|
||||
a per socket basis. Also outputs total data volume transferred from main memory.
|
||||
SSE scalar and packed double precision FLOP rates. Also reports on packed AVX
|
||||
32b instructions.
|
||||
The operational intensity is calculated using the FP values of the cores and the
|
||||
memory data volume of the whole socket. The actual operational intensity for
|
||||
multiple CPUs can be found in the statistics table in the Sum column.
|
70
collectors/likwid/groups/CLX/MEM_SP.txt
Normal file
70
collectors/likwid/groups/CLX/MEM_SP.txt
Normal file
@@ -0,0 +1,70 @@
|
||||
SHORT Overview of arithmetic and main memory performance
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PWR0 PWR_PKG_ENERGY
|
||||
PWR3 PWR_DRAM_ENERGY
|
||||
PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
|
||||
PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE
|
||||
PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
|
||||
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE
|
||||
MBOX0C0 CAS_COUNT_RD
|
||||
MBOX0C1 CAS_COUNT_WR
|
||||
MBOX1C0 CAS_COUNT_RD
|
||||
MBOX1C1 CAS_COUNT_WR
|
||||
MBOX2C0 CAS_COUNT_RD
|
||||
MBOX2C1 CAS_COUNT_WR
|
||||
MBOX3C0 CAS_COUNT_RD
|
||||
MBOX3C1 CAS_COUNT_WR
|
||||
MBOX4C0 CAS_COUNT_RD
|
||||
MBOX4C1 CAS_COUNT_WR
|
||||
MBOX5C0 CAS_COUNT_RD
|
||||
MBOX5C1 CAS_COUNT_WR
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Energy [J] PWR0
|
||||
Power [W] PWR0/time
|
||||
Energy DRAM [J] PWR3
|
||||
Power DRAM [W] PWR3/time
|
||||
SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time
|
||||
AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time
|
||||
Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time
|
||||
Scalar [MUOPS/s] 1.0E-06*PMC1/time
|
||||
Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time
|
||||
Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0
|
||||
Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time
|
||||
Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0
|
||||
Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time
|
||||
Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0
|
||||
Operational intensity (PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0)
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Power [W] = PWR_PKG_ENERGY/runtime
|
||||
Power DRAM [W] = PWR_DRAM_ENERGY/runtime
|
||||
SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
|
||||
AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
|
||||
Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime
|
||||
Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
|
||||
Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/runtime
|
||||
Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
|
||||
Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime
|
||||
Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0
|
||||
Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/runtime
|
||||
Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0
|
||||
Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/((SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0)
|
||||
--
|
||||
Profiling group to measure memory bandwidth drawn by all cores of a socket.
|
||||
Since this group is based on Uncore events it is only possible to measure on
|
||||
a per socket basis. Also outputs total data volume transferred from main memory.
|
||||
SSE scalar and packed single precision FLOP rates. Also reports on packed AVX
|
||||
32b instructions.
|
||||
The operational intensity is calculated using the FP values of the cores and the
|
||||
memory data volume of the whole socket. The actual operational intensity for
|
||||
multiple CPUs can be found in the statistics table in the Sum column.
|
46
collectors/likwid/groups/CLX/PMM.txt
Normal file
46
collectors/likwid/groups/CLX/PMM.txt
Normal file
@@ -0,0 +1,46 @@
|
||||
SHORT Intel Optane DC bandwidth in MBytes/s
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
MBOX0C0 PMM_CMD1_RD
|
||||
MBOX0C1 PMM_CMD1_WR
|
||||
MBOX1C0 PMM_CMD1_RD
|
||||
MBOX1C1 PMM_CMD1_WR
|
||||
MBOX2C0 PMM_CMD1_RD
|
||||
MBOX2C1 PMM_CMD1_WR
|
||||
MBOX3C0 PMM_CMD1_RD
|
||||
MBOX3C1 PMM_CMD1_WR
|
||||
MBOX4C0 PMM_CMD1_RD
|
||||
MBOX4C1 PMM_CMD1_WR
|
||||
MBOX5C0 PMM_CMD1_RD
|
||||
MBOX5C1 PMM_CMD1_WR
|
||||
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
PMM read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time
|
||||
PMM read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0
|
||||
PMM write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time
|
||||
PMM write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0
|
||||
PMM bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time
|
||||
PMM data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
PMM read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime
|
||||
PMM read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
|
||||
PMM write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime
|
||||
PMM write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
|
||||
PMM bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime
|
||||
PMM data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
|
||||
-
|
||||
Profiling group to measure data rate and volume for accesses to Intel Optane DC
|
||||
persistent memory. The Intel Optane DC devices are handled by the memory
|
||||
controllers but require different events.
|
||||
|
35
collectors/likwid/groups/CLX/TLB_DATA.txt
Normal file
35
collectors/likwid/groups/CLX/TLB_DATA.txt
Normal file
@@ -0,0 +1,35 @@
|
||||
SHORT L2 data TLB miss rate/ratio
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK
|
||||
PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK
|
||||
PMC2 DTLB_LOAD_MISSES_WALK_ACTIVE
|
||||
PMC3 DTLB_STORE_MISSES_WALK_ACTIVE
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
L1 DTLB load misses PMC0
|
||||
L1 DTLB load miss rate PMC0/FIXC0
|
||||
L1 DTLB load miss duration [Cyc] PMC2/PMC0
|
||||
L1 DTLB store misses PMC1
|
||||
L1 DTLB store miss rate PMC1/FIXC0
|
||||
L1 DTLB store miss duration [Cyc] PMC3/PMC1
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
|
||||
L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
|
||||
L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_ACTIVE / DTLB_LOAD_MISSES_CAUSES_A_WALK
|
||||
L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
|
||||
L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
|
||||
L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_ACTIVE / DTLB_STORE_MISSES_CAUSES_A_WALK
|
||||
-
|
||||
The DTLB load and store miss rates give a measure of how often a TLB miss occurred
|
||||
per instruction. The duration measures how long a page walk took, in cycles.
|
||||
|
28
collectors/likwid/groups/CLX/TLB_INSTR.txt
Normal file
28
collectors/likwid/groups/CLX/TLB_INSTR.txt
Normal file
@@ -0,0 +1,28 @@
|
||||
SHORT L1 Instruction TLB miss rate/ratio
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 ITLB_MISSES_CAUSES_A_WALK
|
||||
PMC1 ITLB_MISSES_WALK_ACTIVE
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
L1 ITLB misses PMC0
|
||||
L1 ITLB miss rate PMC0/FIXC0
|
||||
L1 ITLB miss duration [Cyc] PMC1/PMC0
|
||||
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
|
||||
L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
|
||||
L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_ACTIVE / ITLB_MISSES_CAUSES_A_WALK
|
||||
-
|
||||
The ITLB miss rate gives a measure of how often a TLB miss occurred
|
||||
per instruction. The duration measures how long a page walk took, in cycles.
|
||||
|
48
collectors/likwid/groups/CLX/TMA.txt
Normal file
48
collectors/likwid/groups/CLX/TMA.txt
Normal file
@@ -0,0 +1,48 @@
|
||||
SHORT Top down cycle allocation
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 UOPS_ISSUED_ANY
|
||||
PMC1 UOPS_RETIRED_RETIRE_SLOTS
|
||||
PMC2 IDQ_UOPS_NOT_DELIVERED_CORE
|
||||
PMC3 INT_MISC_RECOVERY_CYCLES
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
IPC FIXC0/FIXC1
|
||||
Total Slots 4*FIXC1
|
||||
Slots Retired PMC1
|
||||
Fetch Bubbles PMC2
|
||||
Recovery Bubbles 4*PMC3
|
||||
Front End [%] PMC2/(4*FIXC1)*100
|
||||
Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100
|
||||
Retiring [%] PMC1/(4*FIXC1)*100
|
||||
Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Total Slots = 4*CPU_CLK_UNHALTED_CORE
|
||||
Slots Retired = UOPS_RETIRED_RETIRE_SLOTS
|
||||
Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE
|
||||
Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES
|
||||
Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100
|
||||
Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100
|
||||
Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100
|
||||
Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100
|
||||
--
|
||||
This performance group measures cycles to determine percentage of time spent in
|
||||
front end, back end, retiring and speculation. These metrics are published and
|
||||
verified by Intel. Further information:
|
||||
Webpage describing Top-Down Method and its usage in Intel vTune:
|
||||
https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method
|
||||
Paper by Ahmad Yasin:
|
||||
https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0
|
||||
Slides by Ahmad Yasin:
|
||||
http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf
|
||||
The performance group was originally published here:
|
||||
http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/
|
31
collectors/likwid/groups/CLX/UOPS_EXEC.txt
Normal file
31
collectors/likwid/groups/CLX/UOPS_EXEC.txt
Normal file
@@ -0,0 +1,31 @@
|
||||
SHORT UOPs execution
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 UOPS_EXECUTED_USED_CYCLES
|
||||
PMC1 UOPS_EXECUTED_STALL_CYCLES
|
||||
PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
|
||||
PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Used cycles ratio [%] 100*PMC0/PMC2
|
||||
Unused cycles ratio [%] 100*PMC1/PMC2
|
||||
Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
|
||||
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
|
||||
Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
|
||||
Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT
|
||||
-
|
||||
This performance group returns the ratios of used and unused cycles regarding
|
||||
the execution stage in the pipeline. Used cycles are all cycles where uOPs are
|
||||
executed while unused cycles refer to pipeline stalls. Moreover, the group
|
||||
calculates the average stall duration in cycles.
|
31
collectors/likwid/groups/CLX/UOPS_ISSUE.txt
Normal file
31
collectors/likwid/groups/CLX/UOPS_ISSUE.txt
Normal file
@@ -0,0 +1,31 @@
|
||||
SHORT UOPs issuing
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 UOPS_ISSUED_USED_CYCLES
|
||||
PMC1 UOPS_ISSUED_STALL_CYCLES
|
||||
PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
|
||||
PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Used cycles ratio [%] 100*PMC0/PMC2
|
||||
Unused cycles ratio [%] 100*PMC1/PMC2
|
||||
Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
|
||||
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
|
||||
Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
|
||||
Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT
|
||||
-
|
||||
This performance group returns the ratios of used and unused cycles regarding
|
||||
the issue stage in the pipeline. Used cycles are all cycles where uOPs are
|
||||
issued while unused cycles refer to pipeline stalls. Moreover, the group
|
||||
calculates the average stall duration in cycles.
|
31
collectors/likwid/groups/CLX/UOPS_RETIRE.txt
Normal file
31
collectors/likwid/groups/CLX/UOPS_RETIRE.txt
Normal file
@@ -0,0 +1,31 @@
|
||||
SHORT UOPs retirement
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 UOPS_RETIRED_USED_CYCLES
|
||||
PMC1 UOPS_RETIRED_STALL_CYCLES
|
||||
PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
|
||||
PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Used cycles ratio [%] 100*PMC0/PMC2
|
||||
Unused cycles ratio [%] 100*PMC1/PMC2
|
||||
Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
|
||||
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
|
||||
Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
|
||||
Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT
|
||||
-
|
||||
This performance group returns the ratios of used and unused cycles regarding
|
||||
the retirement stage in the pipeline (re-order buffer). Used cycles are all
|
||||
cycles where uOPs are retired while unused cycles refer to pipeline stalls.
|
||||
Moreover, the group calculates the average stall duration in cycles.
|
42
collectors/likwid/groups/CLX/UPI.txt
Normal file
42
collectors/likwid/groups/CLX/UPI.txt
Normal file
@@ -0,0 +1,42 @@
|
||||
SHORT UPI data traffic
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
SBOX0C0 TXL_FLITS_ALL_DATA
|
||||
SBOX0C1 RXL_FLITS_ALL_DATA
|
||||
SBOX1C0 TXL_FLITS_ALL_DATA
|
||||
SBOX1C1 RXL_FLITS_ALL_DATA
|
||||
SBOX2C0 TXL_FLITS_ALL_DATA
|
||||
SBOX2C1 RXL_FLITS_ALL_DATA
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Received data bandwidth [MByte/s] 1.0E-06*((SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0/time
|
||||
Received data volume [GByte] 1.0E-09*((SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0
|
||||
Sent data bandwidth [MByte/s] 1.0E-06*((SBOX0C0+SBOX1C0+SBOX2C0)/9.0)*64.0/time
|
||||
Sent data volume [GByte] 1.0E-09*((SBOX0C0+SBOX1C0+SBOX2C0)/9.0)*64.0
|
||||
Total data bandwidth [MByte/s] 1.0E-06*((SBOX0C0+SBOX1C0+SBOX2C0+SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0/time
|
||||
Total data volume [GByte] 1.0E-09*((SBOX0C0+SBOX1C0+SBOX2C0+SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0
|
||||
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Received data bandwidth [MByte/s] = 1.0E-06*(SUM(RXL_FLITS_ALL_DATA)/9.0)*64.0/runtime
|
||||
Received data volume [GByte] = 1.0E-09*(SUM(RXL_FLITS_ALL_DATA)/9.0)*64.0
|
||||
Sent data bandwidth [MByte/s] = 1.0E-06*(SUM(TXL_FLITS_ALL_DATA)/9.0)*64.0/runtime
|
||||
Sent data volume [GByte] = 1.0E-09*(SUM(TXL_FLITS_ALL_DATA)/9.0)*64.0
|
||||
Total data bandwidth [MByte/s] = 1.0E-06*((SUM(RXL_FLITS_ALL_DATA)+SUM(TXL_FLITS_ALL_DATA))/9.0)*64.0/runtime
|
||||
Total data volume [GByte] = 1.0E-09*((SUM(RXL_FLITS_ALL_DATA)+SUM(TXL_FLITS_ALL_DATA))/9.0)*64.0
|
||||
--
|
||||
This group measures the data traffic on the UPI (socket interconnect). The group
|
||||
measures all filled data slots (9 slots per 64 Byte data transfer), that's why
|
||||
the count needs to be divided by 9. These 9 data chunks are not transferred in
|
||||
a single flit but there is one flit for the header and three flits for the data.
|
||||
The metrics show higher values than expected because the events also count
|
||||
different transfers which include data.
|
31
collectors/likwid/groups/ICL/BRANCH.txt
Normal file
31
collectors/likwid/groups/ICL/BRANCH.txt
Normal file
@@ -0,0 +1,31 @@
|
||||
SHORT Branch prediction miss rate/ratio
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 BR_INST_RETIRED_ALL_BRANCHES
|
||||
PMC1 BR_MISP_RETIRED_ALL_BRANCHES
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Branch rate PMC0/FIXC0
|
||||
Branch misprediction rate PMC1/FIXC0
|
||||
Branch misprediction ratio PMC1/PMC0
|
||||
Instructions per branch FIXC0/PMC0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
|
||||
Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
|
||||
Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
|
||||
Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
|
||||
-
|
||||
The rates state how often on average a branch or a mispredicted branch occurred
|
||||
per instruction retired in total. The branch misprediction ratio sets directly
|
||||
into relation what ratio of all branch instructions were mispredicted.
|
||||
Instructions per branch is 1/branch rate.
|
||||
|
22
collectors/likwid/groups/ICL/DATA.txt
Normal file
22
collectors/likwid/groups/ICL/DATA.txt
Normal file
@@ -0,0 +1,22 @@
|
||||
SHORT Load to store ratio
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 MEM_INST_RETIRED_ALL_LOADS
|
||||
PMC1 MEM_INST_RETIRED_ALL_STORES
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Load to store ratio PMC0/PMC1
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES
|
||||
-
|
||||
This is a metric to determine your load to store ratio.
|
||||
|
24
collectors/likwid/groups/ICL/DIVIDE.txt
Normal file
24
collectors/likwid/groups/ICL/DIVIDE.txt
Normal file
@@ -0,0 +1,24 @@
|
||||
SHORT Divide unit information
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 ARITH_DIVIDER_COUNT
|
||||
PMC1 ARITH_DIVIDER_ACTIVE
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Number of divide ops PMC0
|
||||
Avg. divide unit usage duration PMC1/PMC0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Number of divide ops = ARITH_DIVIDER_COUNT
|
||||
Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT
|
||||
-
|
||||
This performance group measures the average latency of divide operations.
|
35
collectors/likwid/groups/ICL/ENERGY.txt
Normal file
35
collectors/likwid/groups/ICL/ENERGY.txt
Normal file
@@ -0,0 +1,35 @@
|
||||
SHORT Power and Energy consumption
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
TMP0 TEMP_CORE
|
||||
PWR0 PWR_PKG_ENERGY
|
||||
PWR1 PWR_PP0_ENERGY
|
||||
PWR3 PWR_DRAM_ENERGY
|
||||
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Temperature [C] TMP0
|
||||
Energy [J] PWR0
|
||||
Power [W] PWR0/time
|
||||
Energy PP0 [J] PWR1
|
||||
Power PP0 [W] PWR1/time
|
||||
Energy DRAM [J] PWR3
|
||||
Power DRAM [W] PWR3/time
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Power = PWR_PKG_ENERGY / time
|
||||
Power PP0 = PWR_PP0_ENERGY / time
|
||||
Power DRAM = PWR_DRAM_ENERGY / time
|
||||
-
|
||||
Ice Lake implements the RAPL interface. This interface makes it possible to
|
||||
monitor the consumed energy on the package (socket) and DRAM level.
|
||||
|
25
collectors/likwid/groups/ICL/FLOPS_AVX.txt
Normal file
25
collectors/likwid/groups/ICL/FLOPS_AVX.txt
Normal file
@@ -0,0 +1,25 @@
|
||||
SHORT Packed AVX MFLOP/s
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
|
||||
PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
|
||||
PMC2 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE
|
||||
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0+PMC2*16.0)/time
|
||||
Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0+PMC3*8.0)/time
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
|
||||
Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
|
||||
-
|
||||
Packed AVX (256-bit) and AVX-512 single and double precision FLOP rates.
|
34
collectors/likwid/groups/ICL/FLOPS_DP.txt
Normal file
34
collectors/likwid/groups/ICL/FLOPS_DP.txt
Normal file
@@ -0,0 +1,34 @@
|
||||
SHORT Double Precision MFLOP/s
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
|
||||
PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
|
||||
PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
|
||||
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time
|
||||
AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time
|
||||
AVX512 DP [MFLOP/s] 1.0E-06*(PMC3*8.0)/time
|
||||
Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time
|
||||
Scalar [MUOPS/s] 1.0E-06*PMC1/time
|
||||
Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3)
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
|
||||
AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
|
||||
AVX512 DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
|
||||
Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime
|
||||
Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
|
||||
Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)
|
||||
-
|
||||
SSE scalar and packed double precision FLOP rates.
|
||||
|
34
collectors/likwid/groups/ICL/FLOPS_SP.txt
Normal file
34
collectors/likwid/groups/ICL/FLOPS_SP.txt
Normal file
@@ -0,0 +1,34 @@
|
||||
SHORT Single Precision MFLOP/s
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
|
||||
PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE
|
||||
PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
|
||||
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time
|
||||
AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time
|
||||
AVX512 SP [MFLOP/s] 1.0E-06*(PMC3*16.0)/time
|
||||
Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time
|
||||
Scalar [MUOPS/s] 1.0E-06*PMC1/time
|
||||
Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3)
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
|
||||
AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
|
||||
AVX512 SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
|
||||
Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime
|
||||
Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
|
||||
Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)
|
||||
-
|
||||
SSE scalar and packed single precision FLOP rates.
|
||||
|
32
collectors/likwid/groups/ICX/BRANCH.txt
Normal file
32
collectors/likwid/groups/ICX/BRANCH.txt
Normal file
@@ -0,0 +1,32 @@
|
||||
SHORT Branch prediction miss rate/ratio
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
FIXC3 TOPDOWN_SLOTS
|
||||
PMC0 BR_INST_RETIRED_ALL_BRANCHES
|
||||
PMC1 BR_MISP_RETIRED_ALL_BRANCHES
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Branch rate PMC0/FIXC0
|
||||
Branch misprediction rate PMC1/FIXC0
|
||||
Branch misprediction ratio PMC1/PMC0
|
||||
Instructions per branch FIXC0/PMC0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
|
||||
Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
|
||||
Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
|
||||
Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
|
||||
-
|
||||
The rates state how often on average a branch or a mispredicted branch occurred
|
||||
per instruction retired in total. The branch misprediction ratio sets directly
|
||||
into relation what ratio of all branch instructions were mispredicted.
|
||||
Instructions per branch is 1/branch rate.
|
||||
|
23
collectors/likwid/groups/ICX/DATA.txt
Normal file
23
collectors/likwid/groups/ICX/DATA.txt
Normal file
@@ -0,0 +1,23 @@
|
||||
SHORT Load to store ratio
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
FIXC3 TOPDOWN_SLOTS
|
||||
PMC0 MEM_INST_RETIRED_ALL_LOADS
|
||||
PMC1 MEM_INST_RETIRED_ALL_STORES
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Load to store ratio PMC0/PMC1
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES
|
||||
-
|
||||
This is a metric to determine your load to store ratio.
|
||||
|
25
collectors/likwid/groups/ICX/DIVIDE.txt
Normal file
25
collectors/likwid/groups/ICX/DIVIDE.txt
Normal file
@@ -0,0 +1,25 @@
|
||||
SHORT Divide unit information
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
FIXC3 TOPDOWN_SLOTS
|
||||
PMC0 ARITH_DIVIDER_COUNT
|
||||
PMC1 ARITH_DIVIDER_ACTIVE
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Number of divide ops PMC0
|
||||
Avg. divide unit usage duration PMC1/PMC0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Number of divide ops = ARITH_DIVIDER_COUNT
|
||||
Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT
|
||||
-
|
||||
This performance group measures the average latency of divide operations
|
26
collectors/likwid/groups/ICX/FLOPS_AVX.txt
Normal file
26
collectors/likwid/groups/ICX/FLOPS_AVX.txt
Normal file
@@ -0,0 +1,26 @@
|
||||
SHORT Packed AVX MFLOP/s
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
FIXC3 TOPDOWN_SLOTS
|
||||
PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
|
||||
PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
|
||||
PMC2 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE
|
||||
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0+PMC2*16.0)/time
|
||||
Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0+PMC3*8.0)/time
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
|
||||
Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
|
||||
-
|
||||
Packed AVX (256-bit and 512-bit) FLOP rates for single and double precision.
|
35
collectors/likwid/groups/ICX/FLOPS_DP.txt
Normal file
35
collectors/likwid/groups/ICX/FLOPS_DP.txt
Normal file
@@ -0,0 +1,35 @@
|
||||
SHORT Double Precision MFLOP/s
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
FIXC3 TOPDOWN_SLOTS
|
||||
PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
|
||||
PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
|
||||
PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
|
||||
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time
|
||||
AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time
|
||||
AVX512 DP [MFLOP/s] 1.0E-06*(PMC3*8.0)/time
|
||||
Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time
|
||||
Scalar [MUOPS/s] 1.0E-06*PMC1/time
|
||||
Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3)
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
|
||||
AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
|
||||
AVX512 DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
|
||||
Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime
|
||||
Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
|
||||
Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)
|
||||
-
|
||||
Scalar and packed (SSE, AVX, AVX-512) double precision FLOP rates.
|
||||
|
35
collectors/likwid/groups/ICX/FLOPS_SP.txt
Normal file
35
collectors/likwid/groups/ICX/FLOPS_SP.txt
Normal file
@@ -0,0 +1,35 @@
|
||||
SHORT Single Precision MFLOP/s
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
FIXC3 TOPDOWN_SLOTS
|
||||
PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
|
||||
PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE
|
||||
PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
|
||||
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time
|
||||
AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time
|
||||
AVX512 SP [MFLOP/s] 1.0E-06*(PMC3*16.0)/time
|
||||
Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time
|
||||
Scalar [MUOPS/s] 1.0E-06*PMC1/time
|
||||
Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3)
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
|
||||
AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
|
||||
AVX512 SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
|
||||
Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime
|
||||
Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
|
||||
Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)
|
||||
-
|
||||
Scalar and packed (SSE, AVX, AVX-512) single precision FLOP rates.
|
||||
|
39
collectors/likwid/groups/ICX/L2.txt
Normal file
39
collectors/likwid/groups/ICX/L2.txt
Normal file
@@ -0,0 +1,39 @@
|
||||
SHORT L2 cache bandwidth in MBytes/s
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
FIXC3 TOPDOWN_SLOTS
|
||||
PMC0 L1D_REPLACEMENT
|
||||
PMC1 L2_TRANS_L1D_WB
|
||||
PMC2 ICACHE_64B_IFTAG_MISS
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
|
||||
L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
|
||||
L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
|
||||
L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
|
||||
L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
|
||||
L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
|
||||
L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
|
||||
L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time
|
||||
L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0
|
||||
L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_64B_IFTAG_MISS)*64/time
|
||||
L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_64B_IFTAG_MISS)*64
|
||||
-
|
||||
Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
|
||||
number of cache lines allocated in the L1 and the number of modified cache lines
|
||||
evicted from the L1. The group also outputs the total data volume transferred between
|
||||
L2 and L1. Note that this bandwidth also includes data transfers due to a write
|
||||
allocate load on a store miss in L1 and traffic caused by misses in the
|
||||
L1 instruction cache.
|
||||
|
31
collectors/likwid/groups/TGL/BRANCH.txt
Normal file
31
collectors/likwid/groups/TGL/BRANCH.txt
Normal file
@@ -0,0 +1,31 @@
|
||||
SHORT Branch prediction miss rate/ratio
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 BR_INST_RETIRED_ALL_BRANCHES
|
||||
PMC1 BR_MISP_RETIRED_ALL_BRANCHES
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Branch rate PMC0/FIXC0
|
||||
Branch misprediction rate PMC1/FIXC0
|
||||
Branch misprediction ratio PMC1/PMC0
|
||||
Instructions per branch FIXC0/PMC0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
|
||||
Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
|
||||
Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
|
||||
Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
|
||||
-
|
||||
The rates state how often on average a branch or a mispredicted branch occurred
|
||||
per instruction retired in total. The branch misprediction ratio sets directly
|
||||
into relation what ratio of all branch instructions were mispredicted.
|
||||
Instructions per branch is 1/branch rate.
|
||||
|
22
collectors/likwid/groups/TGL/DATA.txt
Normal file
22
collectors/likwid/groups/TGL/DATA.txt
Normal file
@@ -0,0 +1,22 @@
|
||||
SHORT Load to store ratio
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 MEM_INST_RETIRED_ALL_LOADS
|
||||
PMC1 MEM_INST_RETIRED_ALL_STORES
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Load to store ratio PMC0/PMC1
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES
|
||||
-
|
||||
This is a metric to determine your load to store ratio.
|
||||
|
24
collectors/likwid/groups/TGL/DIVIDE.txt
Normal file
24
collectors/likwid/groups/TGL/DIVIDE.txt
Normal file
@@ -0,0 +1,24 @@
|
||||
SHORT Divide unit information
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 ARITH_DIVIDER_COUNT
|
||||
PMC1 ARITH_DIVIDER_ACTIVE
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Number of divide ops PMC0
|
||||
Avg. divide unit usage duration PMC1/PMC0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Number of divide ops = ARITH_DIVIDER_COUNT
|
||||
Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT
|
||||
-
|
||||
This performance group measures the average latency of divide operations
|
35
collectors/likwid/groups/TGL/ENERGY.txt
Normal file
35
collectors/likwid/groups/TGL/ENERGY.txt
Normal file
@@ -0,0 +1,35 @@
|
||||
SHORT Power and Energy consumption
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
TMP0 TEMP_CORE
|
||||
PWR0 PWR_PKG_ENERGY
|
||||
PWR1 PWR_PP0_ENERGY
|
||||
PWR3 PWR_DRAM_ENERGY
|
||||
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Temperature [C] TMP0
|
||||
Energy [J] PWR0
|
||||
Power [W] PWR0/time
|
||||
Energy PP0 [J] PWR1
|
||||
Power PP0 [W] PWR1/time
|
||||
Energy DRAM [J] PWR3
|
||||
Power DRAM [W] PWR3/time
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Power = PWR_PKG_ENERGY / time
|
||||
Power PP0 = PWR_PP0_ENERGY / time
|
||||
Power DRAM = PWR_DRAM_ENERGY / time
|
||||
-
|
||||
Tiger Lake implements the RAPL interface. This interface makes it possible to
|
||||
monitor the consumed energy on the package (socket) and DRAM level.
|
||||
|
25
collectors/likwid/groups/TGL/FLOPS_AVX.txt
Normal file
25
collectors/likwid/groups/TGL/FLOPS_AVX.txt
Normal file
@@ -0,0 +1,25 @@
|
||||
SHORT Packed AVX MFLOP/s
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
|
||||
PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
|
||||
PMC2 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE
|
||||
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0+PMC2*16.0)/time
|
||||
Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0+PMC3*8.0)/time
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
|
||||
Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
|
||||
-
|
||||
Packed AVX (256-bit and 512-bit) FLOP rates for single and double precision.
|
34
collectors/likwid/groups/TGL/FLOPS_DP.txt
Normal file
34
collectors/likwid/groups/TGL/FLOPS_DP.txt
Normal file
@@ -0,0 +1,34 @@
|
||||
SHORT Double Precision MFLOP/s
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
|
||||
PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
|
||||
PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
|
||||
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time
|
||||
AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time
|
||||
AVX512 DP [MFLOP/s] 1.0E-06*(PMC3*8.0)/time
|
||||
Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time
|
||||
Scalar [MUOPS/s] 1.0E-06*PMC1/time
|
||||
Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3)
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
|
||||
AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
|
||||
AVX512 DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
|
||||
Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime
|
||||
Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
|
||||
Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)
|
||||
-
|
||||
Scalar and packed (SSE, AVX, AVX-512) double precision FLOP rates.
|
||||
|
34
collectors/likwid/groups/TGL/FLOPS_SP.txt
Normal file
34
collectors/likwid/groups/TGL/FLOPS_SP.txt
Normal file
@@ -0,0 +1,34 @@
|
||||
SHORT Single Precision MFLOP/s
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
|
||||
PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE
|
||||
PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
|
||||
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time
|
||||
AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time
|
||||
AVX512 SP [MFLOP/s] 1.0E-06*(PMC3*16.0)/time
|
||||
Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time
|
||||
Scalar [MUOPS/s] 1.0E-06*PMC1/time
|
||||
Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3)
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
|
||||
AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
|
||||
AVX512 SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
|
||||
Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime
|
||||
Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
|
||||
Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)
|
||||
-
|
||||
Scalar and packed (SSE, AVX, AVX-512) single precision FLOP rates.
|
||||
|
30
collectors/likwid/groups/arm64fx/BRANCH.txt
Normal file
30
collectors/likwid/groups/arm64fx/BRANCH.txt
Normal file
@@ -0,0 +1,30 @@
|
||||
SHORT Branch prediction miss rate/ratio
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 BR_PRED
|
||||
PMC3 BR_MIS_PRED
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
CPI PMC1/PMC0
|
||||
Branch rate PMC2/PMC0
|
||||
Branch misprediction rate PMC3/PMC0
|
||||
Branch misprediction ratio PMC3/(PMC2+PMC3)
|
||||
Instructions per branch PMC0/(PMC2+PMC3)
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
CPI = CPU_CYCLES/INST_RETIRED
|
||||
Branch rate = BR_PRED/INST_RETIRED
|
||||
Branch misprediction rate = BR_MIS_PRED/INST_RETIRED
|
||||
Branch misprediction ratio = BR_MIS_PRED/(BR_PRED+BR_MIS_PRED)
|
||||
Instructions per branch = INST_RETIRED/(BR_PRED+BR_MIS_PRED)
|
||||
-
|
||||
The rates state how often on average a branch or a mispredicted branch occurred
|
||||
per instruction retired in total. The Branch misprediction ratio sets directly
|
||||
into relation what ratio of all branch instructions were mispredicted.
|
||||
Instructions per branch is 1/Branch rate.
|
||||
|
24
collectors/likwid/groups/arm64fx/DATA.txt
Normal file
24
collectors/likwid/groups/arm64fx/DATA.txt
Normal file
@@ -0,0 +1,24 @@
|
||||
SHORT Load to store ratio
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_SPEC
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 LD_SPEC
|
||||
PMC3 ST_SPEC
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
CPI PMC1/PMC0
|
||||
Load to store ratio PMC2/PMC3
|
||||
Load ratio PMC2/PMC0
|
||||
Store ratio PMC3/PMC0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
CPI = CPU_CYCLES/INST_SPEC
|
||||
Load to store ratio = LD_SPEC / ST_SPEC
|
||||
Load ratio = LD_SPEC / INST_SPEC
|
||||
Store ratio = ST_SPEC / INST_SPEC
|
||||
-
|
||||
This is a metric to determine your load to store ratio.
|
||||
|
26
collectors/likwid/groups/arm64fx/FLOPS_DP.txt
Normal file
26
collectors/likwid/groups/arm64fx/FLOPS_DP.txt
Normal file
@@ -0,0 +1,26 @@
|
||||
SHORT Double Precision MFLOP/s
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC3 FP_DP_FIXED_OPS_SPEC
|
||||
PMC4 FP_DP_SCALE_OPS_SPEC
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Clock [MHz] 1.E-06*PMC1/time
|
||||
CPI PMC1/PMC0
|
||||
DP (FP) [MFLOP/s] 1E-06*(PMC3)/time
|
||||
DP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC4*128)/128)+PMC3)/time
|
||||
DP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC4*256)/128)+PMC3)/time
|
||||
DP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC4*512)/128)+PMC3)/time
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
DP (FP) [MFLOP/s] = 1E-06*FP_DP_FIXED_OPS_SPEC/time
|
||||
DP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*128)/128))/time
|
||||
DP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*256)/128))/time
|
||||
DP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*512)/128))/time
|
||||
-
|
||||
Double-precision FP rate for scalar and SVE vector operations with different widths. The events for
|
||||
the SVE metrics assume that all vector elements are active.
|
26
collectors/likwid/groups/arm64fx/FLOPS_HP.txt
Normal file
26
collectors/likwid/groups/arm64fx/FLOPS_HP.txt
Normal file
@@ -0,0 +1,26 @@
|
||||
SHORT Half-Precision MFLOP/s
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC3 FP_HP_FIXED_OPS_SPEC
|
||||
PMC4 FP_HP_SCALE_OPS_SPEC
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Clock [MHz] 1.E-06*PMC1/time
|
||||
CPI PMC1/PMC0
|
||||
HP (FP) [MFLOP/s] 1E-06*(PMC3)/time
|
||||
HP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC4*128)/128)+PMC3)/time
|
||||
HP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC4*256)/128)+PMC3)/time
|
||||
HP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC4*512)/128)+PMC3)/time
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
HP (FP) [MFLOP/s] = 1E-06*FP_HP_FIXED_OPS_SPEC/time
|
||||
HP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*128)/128))/time
|
||||
HP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*256)/128))/time
|
||||
HP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*512)/128))/time
|
||||
-
|
||||
Half-precision FP rate for scalar and SVE vector operations with different widths. The events for
|
||||
the SVE metrics assume that all vector elements are active.
|
26
collectors/likwid/groups/arm64fx/FLOPS_SP.txt
Normal file
26
collectors/likwid/groups/arm64fx/FLOPS_SP.txt
Normal file
@@ -0,0 +1,26 @@
|
||||
SHORT Single Precision MFLOP/s
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC3 FP_SP_FIXED_OPS_SPEC
|
||||
PMC4 FP_SP_SCALE_OPS_SPEC
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Clock [MHz] 1.E-06*PMC1/time
|
||||
CPI PMC1/PMC0
|
||||
SP (FP) [MFLOP/s] 1E-06*(PMC3)/time
|
||||
SP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC4*128)/128)+PMC3)/time
|
||||
SP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC4*256)/128)+PMC3)/time
|
||||
SP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC4*512)/128)+PMC3)/time
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
SP (FP) [MFLOP/s] = 1E-06*FP_SP_FIXED_OPS_SPEC/time
|
||||
SP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*128)/128))/time
|
||||
SP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*256)/128))/time
|
||||
SP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*512)/128))/time
|
||||
-
|
||||
Single-precision FP rate for scalar and SVE vector operations with different widths. The events for
|
||||
the SVE metrics assume that all vector elements are active.
|
33
collectors/likwid/groups/arm64fx/FP_PIPE.txt
Normal file
33
collectors/likwid/groups/arm64fx/FP_PIPE.txt
Normal file
@@ -0,0 +1,33 @@
|
||||
SHORT Utilization of FP pipelines
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 FLA_VAL
|
||||
PMC3 FLA_VAL_PRD_CNT
|
||||
PMC4 FLB_VAL
|
||||
PMC5 FLB_VAL_PRD_CNT
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
CPI PMC1/PMC0
|
||||
FP operation pipeline A busy rate [%] (PMC2/PMC1)*100.0
|
||||
FP pipeline A active element rate [%] (PMC3/(PMC2*16))*100.0
|
||||
FP operation pipeline B busy rate [%] (PMC4/PMC1)*100.0
|
||||
FP pipeline B active element rate [%] (PMC5/(PMC4*16))*100.0
|
||||
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
CPI = CPU_CYCLES/INST_RETIRED
|
||||
FP operation pipeline A busy rate [%] = (FLA_VAL/CPU_CYCLES)*100.0
|
||||
FP pipeline A active element rate [%] = (FLA_VAL_PRD_CNT/(FLA_VAL*16))*100.0
|
||||
FP operation pipeline B busy rate [%] = (FLB_VAL/CPU_CYCLES)*100.0
|
||||
FP pipeline B active element rate [%] = (FLB_VAL_PRD_CNT/(FLB_VAL*16))*100.0
|
||||
-
|
||||
FLx_VAL: This event counts valid cycles of FLx pipeline.
|
||||
FLx_VAL_PRD_CNT: This event counts the number of 1's in the predicate bits of
|
||||
requests in the FLx pipeline, where it is corrected so that it
|
||||
becomes 16 when all bits are 1.
|
||||
Each predicate mask has 16 slots, so there are 16 slots per cycle in FLA and
|
||||
FLB. FLA is partly used by other instructions like SVE stores.
|
24
collectors/likwid/groups/arm64fx/ICACHE.txt
Normal file
24
collectors/likwid/groups/arm64fx/ICACHE.txt
Normal file
@@ -0,0 +1,24 @@
|
||||
SHORT Instruction cache miss rate/ratio
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 L1I_CACHE
|
||||
PMC3 L1I_CACHE_REFILL
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
CPI PMC1/PMC0
|
||||
L1I request rate PMC2/PMC0
|
||||
L1I miss rate PMC3/PMC0
|
||||
L1I miss ratio PMC3/PMC2
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
CPI = CPU_CYCLES/INST_RETIRED
|
||||
L1I request rate = L1I_CACHE / INST_RETIRED
|
||||
L1I miss rate = L1I_CACHE_REFILL / INST_RETIRED
|
||||
L1I miss ratio = L1I_CACHE_REFILL / L1I_CACHE
|
||||
-
|
||||
This group measures some L1 instruction cache metrics.
|
40
collectors/likwid/groups/arm64fx/L2.txt
Normal file
40
collectors/likwid/groups/arm64fx/L2.txt
Normal file
@@ -0,0 +1,40 @@
|
||||
SHORT L2 cache bandwidth in MBytes/s
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 L1D_CACHE_REFILL
|
||||
PMC3 L1D_CACHE_WB
|
||||
PMC4 L1I_CACHE_REFILL
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
CPI PMC1/PMC0
|
||||
L1D<-L2 load bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time
|
||||
L1D<-L2 load data volume [GBytes] 1.0E-09*(PMC2)*256.0
|
||||
L1D->L2 evict bandwidth [MBytes/s] 1.0E-06*PMC3*256.0/time
|
||||
L1D->L2 evict data volume [GBytes] 1.0E-09*PMC3*256.0
|
||||
L1I<-L2 load bandwidth [MBytes/s] 1.0E-06*PMC4*256.0/time
|
||||
L1I<-L2 load data volume [GBytes] 1.0E-09*PMC4*256.0
|
||||
L1<->L2 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3+PMC4)*256.0/time
|
||||
L1<->L2 data volume [GBytes] 1.0E-09*(PMC2+PMC3+PMC4)*256.0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
CPI = CPU_CYCLES/INST_RETIRED
|
||||
L1D<-L2 load bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_REFILL*256.0/time
|
||||
L1D<-L2 load data volume [GBytes] = 1.0E-09*L1D_CACHE_REFILL*256.0
|
||||
L1D->L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_WB*256.0/time
|
||||
L1D->L2 evict data volume [GBytes] = 1.0E-09*L1D_CACHE_WB*256.0
|
||||
L1I<-L2 load bandwidth [MBytes/s] = 1.0E-06*L1I_CACHE_REFILL*256.0/time
|
||||
L1I<-L2 load data volume [GBytes] = 1.0E-09*L1I_CACHE_REFILL*256.0
|
||||
L1<->L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*256.0/time
|
||||
L1<->L2 data volume [GBytes] = 1.0E-09*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*256.0
|
||||
-
|
||||
Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
|
||||
number of cache lines loaded from the L2 to the L1 data cache and the writebacks from
|
||||
the L1 data cache to the L2 cache. The group also outputs total data volume transferred between
|
||||
L2 and L1. Note that this bandwidth also includes data transfers due to a write
|
||||
allocate load on a store miss in L1 and cache lines transferred to the L1 instruction
|
||||
cache.
|
29
collectors/likwid/groups/arm64fx/MEM.txt
Normal file
29
collectors/likwid/groups/arm64fx/MEM.txt
Normal file
@@ -0,0 +1,29 @@
|
||||
SHORT Main memory bandwidth in MBytes/s
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 BUS_READ_TOTAL_MEM
|
||||
PMC3 BUS_WRITE_TOTAL_MEM
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
CPI PMC1/PMC0
|
||||
Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time
|
||||
Memory read data volume [GBytes] 1.0E-09*(PMC2)*256.0
|
||||
Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time
|
||||
Memory write data volume [GBytes] 1.0E-09*(PMC3)*256.0
|
||||
Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time
|
||||
Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Memory read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM)*256.0/runtime
|
||||
Memory read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM)*256.0
|
||||
Memory write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_MEM)*256.0/runtime
|
||||
Memory write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_MEM)*256.0
|
||||
Memory bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0/runtime
|
||||
Memory data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0
|
||||
-
|
||||
Profiling group to measure memory bandwidth. The cache line size is 256 Byte.
|
50
collectors/likwid/groups/arm64fx/MEM_DP.txt
Normal file
50
collectors/likwid/groups/arm64fx/MEM_DP.txt
Normal file
@@ -0,0 +1,50 @@
|
||||
SHORT Overview of arithmetic and main memory performance
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 BUS_READ_TOTAL_MEM
|
||||
PMC3 BUS_WRITE_TOTAL_MEM
|
||||
PMC4 FP_DP_FIXED_OPS_SPEC
|
||||
PMC5 FP_DP_SCALE_OPS_SPEC
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
CPI PMC1/PMC0
|
||||
DP (FP) [MFLOP/s] 1E-06*(PMC4)/time
|
||||
DP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC5*128.0)/128.0)+PMC4)/time
|
||||
DP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC5*256.0)/128.0)+PMC4)/time
|
||||
DP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC5*512.0)/128.0)+PMC4)/time
|
||||
Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time
|
||||
Memory read data volume [GBytes] 1.0E-09*(PMC2)*256.0
|
||||
Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time
|
||||
Memory write data volume [GBytes] 1.0E-09*(PMC3)*256.0
|
||||
Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time
|
||||
Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0
|
||||
Operational intensity (FP) PMC4/((PMC2+PMC3)*256.0)
|
||||
Operational intensity (FP+SVE128) (((PMC5*128.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0)
|
||||
Operational intensity (FP+SVE256) (((PMC5*256.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0)
|
||||
Operational intensity (FP+SVE512) (((PMC5*512.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0)
|
||||
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
DP (FP) [MFLOP/s] = 1E-06*FP_DP_FIXED_OPS_SPEC/time
|
||||
DP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*128)/128))/time
|
||||
DP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*256)/128))/time
|
||||
DP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*512)/128))/time
|
||||
Memory read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM)*256.0/runtime
|
||||
Memory read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM)*256.0
|
||||
Memory write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_MEM)*256.0/runtime
|
||||
Memory write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_MEM)*256.0
|
||||
Memory bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0/runtime
|
||||
Memory data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0
|
||||
Operational intensity (FP) = FP_DP_FIXED_OPS_SPEC/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0)
|
||||
Operational intensity (FP+SVE128) = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*128)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0)
|
||||
Operational intensity (FP+SVE256) = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*256)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0)
|
||||
Operational intensity (FP+SVE512) = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*512)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0)
|
||||
-
|
||||
Profiling group to measure memory bandwidth and double-precision FP rate for scalar and SVE vector
|
||||
operations with different widths. The events for the SVE metrics assume that all vector elements
|
||||
are active. The cache line size is 256 Byte.
|
50
collectors/likwid/groups/arm64fx/MEM_HP.txt
Normal file
50
collectors/likwid/groups/arm64fx/MEM_HP.txt
Normal file
@@ -0,0 +1,50 @@
|
||||
SHORT Overview of arithmetic and main memory performance
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 BUS_READ_TOTAL_MEM
|
||||
PMC3 BUS_WRITE_TOTAL_MEM
|
||||
PMC4 FP_HP_FIXED_OPS_SPEC
|
||||
PMC5 FP_HP_SCALE_OPS_SPEC
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
CPI PMC1/PMC0
|
||||
HP (FP) [MFLOP/s] 1E-06*(PMC4)/time
|
||||
HP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC5*128.0)/128.0)+PMC4)/time
|
||||
HP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC5*256.0)/128.0)+PMC4)/time
|
||||
HP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC5*512.0)/128.0)+PMC4)/time
|
||||
Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time
|
||||
Memory read data volume [GBytes] 1.0E-09*(PMC2)*256.0
|
||||
Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time
|
||||
Memory write data volume [GBytes] 1.0E-09*(PMC3)*256.0
|
||||
Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time
|
||||
Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0
|
||||
Operational intensity (FP) PMC4/((PMC2+PMC3)*256.0)
|
||||
Operational intensity (FP+SVE128) (((PMC5*128.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0)
|
||||
Operational intensity (FP+SVE256) (((PMC5*256.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0)
|
||||
Operational intensity (FP+SVE512) (((PMC5*512.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0)
|
||||
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
HP (FP) [MFLOP/s] = 1E-06*FP_HP_FIXED_OPS_SPEC/time
|
||||
HP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*128)/128))/time
|
||||
HP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*256)/128))/time
|
||||
HP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*512)/128))/time
|
||||
Memory read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM)*256.0/runtime
|
||||
Memory read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM)*256.0
|
||||
Memory write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_MEM)*256.0/runtime
|
||||
Memory write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_MEM)*256.0
|
||||
Memory bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0/runtime
|
||||
Memory data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0
|
||||
Operational intensity (FP) = FP_HP_FIXED_OPS_SPEC/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0)
|
||||
Operational intensity (FP+SVE128) = (FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*128)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0)
|
||||
Operational intensity (FP+SVE256) = (FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*256)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0)
|
||||
Operational intensity (FP+SVE512) = (FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*512)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0)
|
||||
-
|
||||
Profiling group to measure memory bandwidth and half-precision FP rate for scalar and SVE vector
|
||||
operations with different widths. The events for the SVE metrics assume that all vector elements
|
||||
are active. The cache line size is 256 Byte.
|
50
collectors/likwid/groups/arm64fx/MEM_SP.txt
Normal file
50
collectors/likwid/groups/arm64fx/MEM_SP.txt
Normal file
@@ -0,0 +1,50 @@
|
||||
SHORT Overview of arithmetic and main memory performance
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 BUS_READ_TOTAL_MEM
|
||||
PMC3 BUS_WRITE_TOTAL_MEM
|
||||
PMC4 FP_SP_FIXED_OPS_SPEC
|
||||
PMC5 FP_SP_SCALE_OPS_SPEC
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
CPI PMC1/PMC0
|
||||
SP (FP) [MFLOP/s] 1E-06*(PMC4)/time
|
||||
SP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC5*128.0)/128.0)+PMC4)/time
|
||||
SP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC5*256.0)/128.0)+PMC4)/time
|
||||
SP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC5*512.0)/128.0)+PMC4)/time
|
||||
Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time
|
||||
Memory read data volume [GBytes] 1.0E-09*(PMC2)*256.0
|
||||
Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time
|
||||
Memory write data volume [GBytes] 1.0E-09*(PMC3)*256.0
|
||||
Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time
|
||||
Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0
|
||||
Operational intensity (FP) PMC4/((PMC2+PMC3)*256.0)
|
||||
Operational intensity (FP+SVE128) (((PMC5*128.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0)
|
||||
Operational intensity (FP+SVE256) (((PMC5*256.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0)
|
||||
Operational intensity (FP+SVE512) (((PMC5*512.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0)
|
||||
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
SP (FP) [MFLOP/s] = 1E-06*FP_SP_FIXED_OPS_SPEC/time
|
||||
SP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*128)/128))/time
|
||||
SP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*256)/128))/time
|
||||
SP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*512)/128))/time
|
||||
Memory read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM)*256.0/runtime
|
||||
Memory read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM)*256.0
|
||||
Memory write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_MEM)*256.0/runtime
|
||||
Memory write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_MEM)*256.0
|
||||
Memory bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0/runtime
|
||||
Memory data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0
|
||||
Operational intensity (FP) = FP_SP_FIXED_OPS_SPEC/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0)
|
||||
Operational intensity (FP+SVE128) = (FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*128)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0)
|
||||
Operational intensity (FP+SVE256) = (FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*256)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0)
|
||||
Operational intensity (FP+SVE512) = (FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*512)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0)
|
||||
-
|
||||
Profiling group to measure memory bandwidth and single-precision FP rate for scalar and SVE vector
|
||||
operations with different widths. The events for the SVE metrics assume that all vector elements
|
||||
are active. The cache line size is 256 Byte.
|
29
collectors/likwid/groups/arm64fx/PCI.txt
Normal file
29
collectors/likwid/groups/arm64fx/PCI.txt
Normal file
@@ -0,0 +1,29 @@
|
||||
SHORT PCI bandwidth in MBytes/s
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 BUS_READ_TOTAL_PCI
|
||||
PMC3 BUS_WRITE_TOTAL_PCI
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
CPI PMC1/PMC0
|
||||
PCI read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time
|
||||
PCI read data volume [GBytes] 1.0E-09*(PMC2)*256.0
|
||||
PCI write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time
|
||||
PCI write data volume [GBytes] 1.0E-09*(PMC3)*256.0
|
||||
PCI bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time
|
||||
PCI data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
PCI read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_PCI)*256.0/runtime
|
||||
PCI read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_PCI)*256.0
|
||||
PCI write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_PCI)*256.0/runtime
|
||||
PCI write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_PCI)*256.0
|
||||
PCI bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_PCI+BUS_WRITE_TOTAL_PCI)*256.0/runtime
|
||||
PCI data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_PCI+BUS_WRITE_TOTAL_PCI)*256.0
|
||||
-
|
||||
Profiling group to measure PCI bandwidth. The cache line size is 256 Byte.
|
29
collectors/likwid/groups/arm64fx/TOFU.txt
Normal file
29
collectors/likwid/groups/arm64fx/TOFU.txt
Normal file
@@ -0,0 +1,29 @@
|
||||
SHORT TOFU bandwidth in MBytes/s
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 BUS_READ_TOTAL_TOFU
|
||||
PMC3 BUS_WRITE_TOTAL_TOFU
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
CPI PMC1/PMC0
|
||||
TOFU read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time
|
||||
TOFU read data volume [GBytes] 1.0E-09*(PMC2)*256.0
|
||||
TOFU write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time
|
||||
TOFU write data volume [GBytes] 1.0E-09*(PMC3)*256.0
|
||||
TOFU bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time
|
||||
TOFU data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
TOFU read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_TOFU)*256.0/runtime
|
||||
TOFU read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_TOFU)*256.0
|
||||
TOFU write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_TOFU)*256.0/runtime
|
||||
TOFU write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_TOFU)*256.0
|
||||
TOFU bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_TOFU+BUS_WRITE_TOTAL_TOFU)*256.0/runtime
|
||||
TOFU data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_TOFU+BUS_WRITE_TOTAL_TOFU)*256.0
|
||||
-
|
||||
Profiling group to measure TOFU bandwidth. The cache line size is 256 Byte.
|
31
collectors/likwid/groups/arm8/BRANCH.txt
Normal file
31
collectors/likwid/groups/arm8/BRANCH.txt
Normal file
@@ -0,0 +1,31 @@
|
||||
SHORT Branch prediction miss rate/ratio
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 BR_PRED
|
||||
PMC3 BR_MIS_PRED
|
||||
PMC4 INST_SPEC
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
CPI PMC1/PMC0
|
||||
Branch rate PMC2/PMC0
|
||||
Branch misprediction rate PMC3/PMC0
|
||||
Branch misprediction ratio PMC3/(PMC2+PMC3)
|
||||
Instructions per branch PMC0/(PMC2+PMC3)
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
CPI = CPU_CYCLES/INST_RETIRED
|
||||
Branch rate = BR_PRED/INST_RETIRED
|
||||
Branch misprediction rate = BR_MIS_PRED/INST_RETIRED
|
||||
Branch misprediction ratio = BR_MIS_PRED/(BR_PRED+BR_MIS_PRED)
|
||||
Instructions per branch = INST_RETIRED/(BR_PRED+BR_MIS_PRED)
|
||||
-
|
||||
The rates state how often on average a branch or a mispredicted branch occurred
|
||||
per instruction retired in total. The Branch misprediction ratio sets directly
|
||||
into relation what ratio of all branch instructions were mispredicted.
|
||||
Instructions per branch is 1/Branch rate.
|
||||
|
24
collectors/likwid/groups/arm8/DATA.txt
Normal file
24
collectors/likwid/groups/arm8/DATA.txt
Normal file
@@ -0,0 +1,24 @@
|
||||
SHORT Load to store ratio
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 LD_RETIRED
|
||||
PMC3 ST_RETIRED
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
CPI PMC1/PMC0
|
||||
Load to store ratio PMC2/PMC3
|
||||
Load ratio PMC2/PMC0
|
||||
Store ratio PMC3/PMC0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
CPI = CPU_CYCLES/INST_RETIRED
|
||||
Load to store ratio = LD_RETIRED / ST_RETIRED
|
||||
Load ratio = LD_RETIRED / INST_RETIRED
|
||||
Store ratio = ST_RETIRED / INST_RETIRED
|
||||
-
|
||||
This is a metric to determine your load to store ratio.
|
||||
|
24
collectors/likwid/groups/arm8/ICACHE.txt
Normal file
24
collectors/likwid/groups/arm8/ICACHE.txt
Normal file
@@ -0,0 +1,24 @@
|
||||
SHORT Instruction cache miss rate/ratio
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 L1I_CACHE
|
||||
PMC3 L1I_CACHE_REFILL
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
CPI PMC1/PMC0
|
||||
L1I request rate PMC2/PMC0
|
||||
L1I miss rate PMC3/PMC0
|
||||
L1I miss ratio PMC3/PMC2
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
CPI = CPU_CYCLES/INST_RETIRED
|
||||
L1I request rate = L1I_CACHE / INST_RETIRED
|
||||
L1I miss rate = L1I_CACHE_REFILL / INST_RETIRED
|
||||
L1I miss ratio = L1I_CACHE_REFILL / L1I_CACHE
|
||||
-
|
||||
This group measures some L1 instruction cache metrics.
|
40
collectors/likwid/groups/arm8/L2.txt
Normal file
40
collectors/likwid/groups/arm8/L2.txt
Normal file
@@ -0,0 +1,40 @@
|
||||
SHORT L2 cache bandwidth in MBytes/s
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 L1D_CACHE_REFILL
|
||||
PMC3 L1D_CACHE_WB
|
||||
PMC4 L1I_CACHE_REFILL
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
CPI PMC1/PMC0
|
||||
L2D load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time
|
||||
L2D load data volume [GBytes] 1.0E-09*PMC2*64.0
|
||||
L2D evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time
|
||||
L2D evict data volume [GBytes] 1.0E-09*PMC3*64.0
|
||||
L2I load bandwidth [MBytes/s] 1.0E-06*PMC4*64.0/time
|
||||
L2I load data volume [GBytes] 1.0E-09*PMC4*64.0
|
||||
L2 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3+PMC4)*64.0/time
|
||||
L2 data volume [GBytes] 1.0E-09*(PMC2+PMC3+PMC4)*64.0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
CPI = CPU_CYCLES/INST_RETIRED
|
||||
L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_REFILL*64.0/time
|
||||
L2D load data volume [GBytes] = 1.0E-09*L1D_CACHE_REFILL*64.0
|
||||
L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_WB*64.0/time
|
||||
L2D evict data volume [GBytes] = 1.0E-09*L1D_CACHE_WB*64.0
|
||||
L2I load bandwidth [MBytes/s] = 1.0E-06*L1I_CACHE_REFILL*64.0/time
|
||||
L2I load data volume [GBytes] = 1.0E-09*L1I_CACHE_REFILL*64.0
|
||||
L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0/time
|
||||
L2 data volume [GBytes] = 1.0E-09*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0
|
||||
-
|
||||
Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
|
||||
number of cacheline loaded from the L2 to the L1 data cache and the writebacks from
|
||||
the L1 data cache to the L2 cache. The group also outputs total data volume transferred between
|
||||
L2 and L1. Note that this bandwidth also includes data transfers due to a write
|
||||
allocate load on a store miss in L1 and cache lines transferred into the instruction
|
||||
cache.
|
30
collectors/likwid/groups/arm8/MEM.txt
Normal file
30
collectors/likwid/groups/arm8/MEM.txt
Normal file
@@ -0,0 +1,30 @@
|
||||
SHORT Main memory bandwidth in MBytes/s
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 L2D_CACHE_REFILL
|
||||
PMC3 L2D_CACHE_WB
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
CPI PMC1/PMC0
|
||||
Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*64.0/time
|
||||
Memory read data volume [GBytes] 1.0E-09*(PMC2)*64.0
|
||||
Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*64.0/time
|
||||
Memory write data volume [GBytes] 1.0E-09*(PMC3)*64.0
|
||||
Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
|
||||
Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Memory read bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL)*64.0/runtime
|
||||
Memory read data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL)*64.0
|
||||
Memory write bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_WB)*64.0/runtime
|
||||
Memory write data volume [GBytes] = 1.0E-09*(L2D_CACHE_WB)*64.0
|
||||
Memory bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL+L2D_CACHE_WB)*64.0/runtime
|
||||
Memory data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL+L2D_CACHE_WB)*64.0
|
||||
-
|
||||
Profiling group to measure memory bandwidth as initiated by the L2 cache.
|
||||
|
31
collectors/likwid/groups/arm8_n1/BRANCH.txt
Normal file
31
collectors/likwid/groups/arm8_n1/BRANCH.txt
Normal file
@@ -0,0 +1,31 @@
|
||||
SHORT Branch prediction miss rate/ratio
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 BR_PRED
|
||||
PMC3 BR_MIS_PRED
|
||||
PMC4 INST_SPEC
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
CPI PMC1/PMC0
|
||||
Branch rate PMC2/PMC0
|
||||
Branch misprediction rate PMC3/PMC0
|
||||
Branch misprediction ratio PMC3/(PMC2+PMC3)
|
||||
Instructions per branch PMC0/(PMC2+PMC3)
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
CPI = CPU_CYCLES/INST_RETIRED
|
||||
Branch rate = BR_PRED/INST_RETIRED
|
||||
Branch misprediction rate = BR_MIS_PRED/INST_RETIRED
|
||||
Branch misprediction ratio = BR_MIS_PRED/(BR_PRED+BR_MIS_PRED)
|
||||
Instructions per branch = INST_RETIRED/(BR_PRED+BR_MIS_PRED)
|
||||
-
|
||||
The rates state how often on average a branch or a mispredicted branch occurred
|
||||
per instruction retired in total. The Branch misprediction ratio sets directly
|
||||
into relation what ratio of all branch instructions were mispredicted.
|
||||
Instructions per branch is 1/Branch rate.
|
||||
|
16
collectors/likwid/groups/arm8_n1/CLOCK.txt
Normal file
16
collectors/likwid/groups/arm8_n1/CLOCK.txt
Normal file
@@ -0,0 +1,16 @@
|
||||
SHORT Cycles and instructions
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
CPI PMC1/PMC0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
CPI = CPU_CYCLES/INST_RETIRED
|
||||
-
|
||||
This is a metric to determine cycles per instruction.
|
||||
|
24
collectors/likwid/groups/arm8_n1/DATA.txt
Normal file
24
collectors/likwid/groups/arm8_n1/DATA.txt
Normal file
@@ -0,0 +1,24 @@
|
||||
SHORT Load to store ratio
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 LD_SPEC
|
||||
PMC3 ST_SPEC
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
CPI PMC1/PMC0
|
||||
Load to store ratio PMC2/PMC3
|
||||
Load ratio PMC2/PMC0
|
||||
Store ratio PMC3/PMC0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
CPI = CPU_CYCLES/INST_RETIRED
|
||||
Load to store ratio = LD_SPEC / ST_SPEC
|
||||
Load ratio = LD_SPEC / INST_RETIRED
|
||||
Store ratio = ST_SPEC / INST_RETIRED
|
||||
-
|
||||
This is a metric to determine your load to store ratio.
|
||||
|
24
collectors/likwid/groups/arm8_n1/ICACHE.txt
Normal file
24
collectors/likwid/groups/arm8_n1/ICACHE.txt
Normal file
@@ -0,0 +1,24 @@
|
||||
SHORT Instruction cache miss rate/ratio
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 L1I_CACHE
|
||||
PMC3 L1I_CACHE_REFILL
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
CPI PMC1/PMC0
|
||||
L1I request rate PMC2/PMC0
|
||||
L1I miss rate PMC3/PMC0
|
||||
L1I miss ratio PMC3/PMC2
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
CPI = CPU_CYCLES/INST_RETIRED
|
||||
L1I request rate = L1I_CACHE / INST_RETIRED
|
||||
L1I miss rate = L1I_CACHE_REFILL / INST_RETIRED
|
||||
L1I miss ratio = L1I_CACHE_REFILL / L1I_CACHE
|
||||
-
|
||||
This group measures some L1 instruction cache metrics.
|
40
collectors/likwid/groups/arm8_n1/L2.txt
Normal file
40
collectors/likwid/groups/arm8_n1/L2.txt
Normal file
@@ -0,0 +1,40 @@
|
||||
SHORT L2 cache bandwidth in MBytes/s
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 L1D_CACHE_REFILL
|
||||
PMC3 L1D_CACHE_WB
|
||||
PMC4 L1I_CACHE_REFILL
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
CPI PMC1/PMC0
|
||||
L2D load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time
|
||||
L2D load data volume [GBytes] 1.0E-09*PMC2*64.0
|
||||
L2D evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time
|
||||
L2D evict data volume [GBytes] 1.0E-09*PMC3*64.0
|
||||
L2I load bandwidth [MBytes/s] 1.0E-06*PMC4*64.0/time
|
||||
L2I load data volume [GBytes] 1.0E-09*PMC4*64.0
|
||||
L2 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3+PMC4)*64.0/time
|
||||
L2 data volume [GBytes] 1.0E-09*(PMC2+PMC3+PMC4)*64.0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
CPI = CPU_CYCLES/INST_RETIRED
|
||||
L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_REFILL*64.0/time
|
||||
L2D load data volume [GBytes] = 1.0E-09*L1D_CACHE_REFILL*64.0
|
||||
L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_WB*64.0/time
|
||||
L2D evict data volume [GBytes] = 1.0E-09*L1D_CACHE_WB*64.0
|
||||
L2I load bandwidth [MBytes/s] = 1.0E-06*L1I_CACHE_REFILL*64.0/time
|
||||
L2I load data volume [GBytes] = 1.0E-09*L1I_CACHE_REFILL*64.0
|
||||
L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0/time
|
||||
L2 data volume [GBytes] = 1.0E-09*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0
|
||||
-
|
||||
Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
|
||||
number of cacheline loaded from the L2 to the L1 data cache and the writebacks from
|
||||
the L1 data cache to the L2 cache. The group also outputs total data volume transferred between
|
||||
L2 and L1. Note that this bandwidth also includes data transfers due to a write
|
||||
allocate load on a store miss in L1 and cache lines transferred into the instruction
|
||||
cache.
|
30
collectors/likwid/groups/arm8_n1/L3.txt
Normal file
30
collectors/likwid/groups/arm8_n1/L3.txt
Normal file
@@ -0,0 +1,30 @@
|
||||
SHORT L3 cache bandwidth in MBytes/s
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 L2D_CACHE_REFILL
|
||||
PMC3 L2D_CACHE_WB
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
CPI PMC1/PMC0
|
||||
L3 read bandwidth [MBytes/s] 1.0E-06*(PMC2)*64.0/time
|
||||
L3 read data volume [GBytes] 1.0E-09*(PMC2)*64.0
|
||||
L3 write bandwidth [MBytes/s] 1.0E-06*(PMC3)*64.0/time
|
||||
L3 write data volume [GBytes] 1.0E-09*(PMC3)*64.0
|
||||
L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
|
||||
L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
L3 read bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL)*64.0/runtime
|
||||
L3 read data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL)*64.0
|
||||
L3 write bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_WB)*64.0/runtime
|
||||
L3 write data volume [GBytes] = 1.0E-09*(L2D_CACHE_WB)*64.0
|
||||
L3 bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL+L2D_CACHE_WB)*64.0/runtime
|
||||
L3 data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL+L2D_CACHE_WB)*64.0
|
||||
-
|
||||
Profiling group to measure traffic between L2 and L3 cache.
|
||||
|
29
collectors/likwid/groups/arm8_n1/MEM.txt
Normal file
29
collectors/likwid/groups/arm8_n1/MEM.txt
Normal file
@@ -0,0 +1,29 @@
|
||||
SHORT Main memory bandwidth in MBytes/s
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 MEM_ACCESS_RD
|
||||
PMC3 MEM_ACCESS_WR
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
CPI PMC1/PMC0
|
||||
Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*64.0/time
|
||||
Memory read data volume [GBytes] 1.0E-09*(PMC2)*64.0
|
||||
Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*64.0/time
|
||||
Memory write data volume [GBytes] 1.0E-09*(PMC3)*64.0
|
||||
Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
|
||||
Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Memory read bandwidth [MBytes/s] = 1.0E-06*(MEM_ACCESS_RD)*64.0/runtime
|
||||
Memory read data volume [GBytes] = 1.0E-09*(MEM_ACCESS_RD)*64.0
|
||||
Memory write bandwidth [MBytes/s] = 1.0E-06*(MEM_ACCESS_WR)*64.0/runtime
|
||||
Memory write data volume [GBytes] = 1.0E-09*(MEM_ACCESS_WR)*64.0
|
||||
Memory bandwidth [MBytes/s] = 1.0E-06*(MEM_ACCESS_RD+MEM_ACCESS_WR)*64.0/runtime
|
||||
Memory data volume [GBytes] = 1.0E-09*(MEM_ACCESS_RD+MEM_ACCESS_WR)*64.0
|
||||
-
|
||||
Profiling group to measure memory bandwidth.
|
||||
|
30
collectors/likwid/groups/arm8_n1/TLB.txt
Normal file
30
collectors/likwid/groups/arm8_n1/TLB.txt
Normal file
@@ -0,0 +1,30 @@
|
||||
SHORT L1/L2 TLB information
|
||||
|
||||
EVENTSET
|
||||
PMC0 L1D_TLB
|
||||
PMC1 L1I_TLB
|
||||
PMC2 L2D_TLB
|
||||
PMC3 L1D_TLB_REFILL
|
||||
PMC4 L1I_TLB_REFILL
|
||||
PMC5 L2D_TLB_REFILL
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
L1 DTLB accesses PMC0
|
||||
L1 ITLB accesses PMC1
|
||||
L2 DTLB accesses PMC2
|
||||
L1 DTLB refills PMC3
|
||||
L1 ITLB refills PMC4
|
||||
L2 DTLB refills PMC5
|
||||
L1 DTLB refill ratio PMC3/PMC0
|
||||
L1 ITLB refill ratio PMC4/PMC1
|
||||
L2 DTLB refill ratio PMC5/PMC2
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
L1 DTLB refill ratio = L1D_TLB_REFILL / L1D_TLB
|
||||
L1 ITLB refill ratio = L1I_TLB_REFILL / L1I_TLB
|
||||
L2 DTLB refill ratio = L2D_TLB_REFILL / L2D_TLB
|
||||
-
|
||||
This group gives information about the TLB usage for all TLBs:
|
||||
L1 data, L1 instruction and L2 data.
|
32
collectors/likwid/groups/arm8_tx2/BRANCH.txt
Normal file
32
collectors/likwid/groups/arm8_tx2/BRANCH.txt
Normal file
@@ -0,0 +1,32 @@
|
||||
SHORT Branch prediction miss rate/ratio
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 BR_PRED
|
||||
PMC3 BR_MIS_PRED
|
||||
PMC4 INST_SPEC
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Clock [MHz] 1.E-06*PMC1/time
|
||||
CPI PMC1/PMC0
|
||||
Branch rate PMC2/PMC0
|
||||
Branch misprediction rate PMC3/PMC0
|
||||
Branch misprediction ratio PMC3/(PMC2+PMC3)
|
||||
Instructions per branch PMC0/(PMC2+PMC3)
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
CPI = CPU_CYCLES/INST_RETIRED
|
||||
Branch rate = BR_PRED/INST_RETIRED
|
||||
Branch misprediction rate = BR_MIS_PRED/INST_RETIRED
|
||||
Branch misprediction ratio = BR_MIS_PRED/(BR_PRED+BR_MIS_PRED)
|
||||
Instructions per branch = INST_RETIRED/(BR_PRED+BR_MIS_PRED)
|
||||
-
|
||||
The rates state how often on average a branch or a mispredicted branch occurred
|
||||
per instruction retired in total. The Branch misprediction ratio sets directly
|
||||
into relation what ratio of all branch instructions were mispredicted.
|
||||
Instructions per branch is 1/Branch rate.
|
||||
|
25
collectors/likwid/groups/arm8_tx2/DATA.txt
Normal file
25
collectors/likwid/groups/arm8_tx2/DATA.txt
Normal file
@@ -0,0 +1,25 @@
|
||||
SHORT Load to store ratio
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 LD_RETIRED
|
||||
PMC3 ST_RETIRED
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Clock [MHz] 1.E-06*PMC1/time
|
||||
CPI PMC1/PMC0
|
||||
Load to store ratio PMC2/PMC3
|
||||
Load ratio PMC2/PMC0
|
||||
Store ratio PMC3/PMC0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
CPI = CPU_CYCLES/INST_RETIRED
|
||||
Load to store ratio = LD_RETIRED / ST_RETIRED
|
||||
Load ratio = LD_RETIRED / INST_RETIRED
|
||||
Store ratio = ST_RETIRED / INST_RETIRED
|
||||
-
|
||||
This is a metric to determine your load to store ratio.
|
||||
|
28
collectors/likwid/groups/arm8_tx2/FLOPS_DP.txt
Normal file
28
collectors/likwid/groups/arm8_tx2/FLOPS_DP.txt
Normal file
@@ -0,0 +1,28 @@
|
||||
SHORT Double Precision MFLOP/s
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 VFP_SPEC
|
||||
PMC3 ASE_SPEC
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Clock [MHz] 1.E-06*PMC1/time
|
||||
CPI PMC1/PMC0
|
||||
DP [MFLOP/s] 1.0E-06*(PMC3*2.0+PMC2)/time
|
||||
NEON DP [MFLOP/s] 1.0E-06*(PMC3*2.0)/time
|
||||
Packed [MUOPS/s] 1.0E-06*(PMC3)/time
|
||||
Scalar [MUOPS/s] 1.0E-06*PMC2/time
|
||||
Vectorization ratio 100*(PMC3)/(PMC2+PMC3)
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
DP [MFLOP/s] = 1.0E-06*(ASE_SPEC*2+VFP_SPEC)/runtime
|
||||
NEON DP [MFLOP/s] = 1.0E-06*(ASE_SPEC*2)/runtime
|
||||
Packed [MUOPS/s] = 1.0E-06*(ASE_SPEC)/runtime
|
||||
Scalar [MUOPS/s] = 1.0E-06*VFP_SPEC/runtime
|
||||
Vectorization ratio = 100*(ASE_SPEC)/(ASE_SPEC+VFP_SPEC)
|
||||
-
|
||||
NEON scalar and packed double precision FLOP rates.
|
||||
|
28
collectors/likwid/groups/arm8_tx2/FLOPS_SP.txt
Normal file
28
collectors/likwid/groups/arm8_tx2/FLOPS_SP.txt
Normal file
@@ -0,0 +1,28 @@
|
||||
SHORT Single Precision MFLOP/s
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 VFP_SPEC
|
||||
PMC3 ASE_SPEC
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Clock [MHz] 1.E-06*PMC1/time
|
||||
CPI PMC1/PMC0
|
||||
SP [MFLOP/s] 1.0E-06*(PMC3*2.0+PMC2)/time
|
||||
NEON SP [MFLOP/s] 1.0E-06*(PMC3*2.0)/time
|
||||
Packed [MUOPS/s] 1.0E-06*(PMC3)/time
|
||||
Scalar [MUOPS/s] 1.0E-06*PMC2/time
|
||||
Vectorization ratio 100*(PMC3)/(PMC2+PMC3)
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
SP [MFLOP/s] = 1.0E-06*(ASE_SPEC*2+VFP_SPEC)/runtime
|
||||
NEON SP [MFLOP/s] = 1.0E-06*(ASE_SPEC*4)/runtime
|
||||
Packed [MUOPS/s] = 1.0E-06*(ASE_SPEC)/runtime
|
||||
Scalar [MUOPS/s] = 1.0E-06*VFP_SPEC/runtime
|
||||
Vectorization ratio = 100*(ASE_SPEC)/(ASE_SPEC+VFP_SPEC)
|
||||
-
|
||||
NEON scalar and packed single precision FLOP rates.
|
||||
|
23
collectors/likwid/groups/arm8_tx2/ICACHE.txt
Normal file
23
collectors/likwid/groups/arm8_tx2/ICACHE.txt
Normal file
@@ -0,0 +1,23 @@
|
||||
SHORT Instruction cache miss rate/ratio
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 L1I_CACHE
|
||||
PMC3 L1I_CACHE_REFILL
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Clock [MHz] 1.E-06*PMC1/time
|
||||
CPI PMC1/PMC0
|
||||
L1I request rate PMC2/PMC0
|
||||
L1I miss rate PMC3/PMC0
|
||||
L1I miss ratio PMC3/PMC2
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
L1I request rate = L1I_CACHE / INST_RETIRED
|
||||
L1I miss rate = L1I_CACHE_REFILL / INST_RETIRED
|
||||
L1I miss ratio = L1I_CACHE_REFILL / L1I_CACHE
|
||||
-
|
||||
This group measures some L1 instruction cache metrics.
|
41
collectors/likwid/groups/arm8_tx2/L2.txt
Normal file
41
collectors/likwid/groups/arm8_tx2/L2.txt
Normal file
@@ -0,0 +1,41 @@
|
||||
SHORT L2 cache bandwidth in MBytes/s
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 L1D_CACHE_REFILL
|
||||
PMC3 L1D_CACHE_WB
|
||||
PMC4 L1I_CACHE_REFILL
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Clock [MHz] 1.E-06*PMC1/time
|
||||
CPI PMC1/PMC0
|
||||
L2D load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time
|
||||
L2D load data volume [GBytes] 1.0E-09*PMC2*64.0
|
||||
L2D evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time
|
||||
L2D evict data volume [GBytes] 1.0E-09*PMC3*64.0
|
||||
L2I load bandwidth [MBytes/s] 1.0E-06*PMC4*64.0/time
|
||||
L2I load data volume [GBytes] 1.0E-09*PMC4*64.0
|
||||
L2 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3+PMC4)*64.0/time
|
||||
L2 data volume [GBytes] 1.0E-09*(PMC2+PMC3+PMC4)*64.0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
CPI = CPU_CYCLES/INST_RETIRED
|
||||
L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_REFILL*64.0/time
|
||||
L2D load data volume [GBytes] = 1.0E-09*L1D_CACHE_REFILL*64.0
|
||||
L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_WB*64.0/time
|
||||
L2D evict data volume [GBytes] = 1.0E-09*L1D_CACHE_WB*64.0
|
||||
L2I load bandwidth [MBytes/s] = 1.0E-06*L1I_CACHE_REFILL*64.0/time
|
||||
L2I load data volume [GBytes] = 1.0E-09*L1I_CACHE_REFILL*64.0
|
||||
L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0/time
|
||||
L2 data volume [GBytes] = 1.0E-09*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0
|
||||
-
|
||||
Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
|
||||
number of cache lines loaded from the L2 to the L1 data cache and the writebacks from
|
||||
the L1 data cache to the L2 cache. The group also outputs total data volume transferred between
|
||||
L2 and L1. Note that this bandwidth also includes data transfers due to a write
|
||||
allocate load on a store miss in L1 and cache lines transferred to the instruction
|
||||
cache.
|
32
collectors/likwid/groups/arm8_tx2/L2CACHE.txt
Normal file
32
collectors/likwid/groups/arm8_tx2/L2CACHE.txt
Normal file
@@ -0,0 +1,32 @@
|
||||
SHORT L2 cache miss rate/ratio
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 L2D_CACHE
|
||||
PMC3 L2D_CACHE_REFILL
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Clock [MHz] 1.E-06*PMC1/time
|
||||
CPI PMC1/PMC0
|
||||
L2 request rate PMC2/PMC0
|
||||
L2 miss rate PMC3/PMC0
|
||||
L2 miss ratio PMC3/PMC2
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
L2 request rate = L2D_CACHE/INST_RETIRED
|
||||
L2 miss rate = L2D_CACHE_REFILL/INST_RETIRED
|
||||
L2 miss ratio = L2D_CACHE_REFILL/L2D_CACHE
|
||||
-
|
||||
This group measures the locality of your data accesses with regard to the
|
||||
L2 cache. L2 request rate tells you how data intensive your code is
|
||||
or how many data accesses you have on average per instruction.
|
||||
The L2 miss rate gives a measure of how often it was necessary to get
|
||||
cache lines from memory. And finally L2 miss ratio tells you how many of your
|
||||
memory references required a cache line to be loaded from a higher level.
|
||||
While the data cache miss rate might be given by your algorithm you should
|
||||
try to get data cache miss ratio as low as possible by increasing your cache reuse.
|
||||
|
||||
|
38
collectors/likwid/groups/arm8_tx2/L3.txt
Normal file
38
collectors/likwid/groups/arm8_tx2/L3.txt
Normal file
@@ -0,0 +1,38 @@
|
||||
SHORT L3 cache bandwidth in MBytes/s
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 L2D_CACHE_REFILL
|
||||
PMC3 L2D_CACHE_WB
|
||||
PMC4 L2D_CACHE_ALLOCATE
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Clock [MHz] 1.E-06*PMC1/time
|
||||
CPI PMC1/PMC0
|
||||
L3 load bandwidth [MBytes/s] 1.0E-06*(PMC2-PMC4)*64.0/time
|
||||
L3 load data volume [GBytes] 1.0E-09*(PMC2-PMC4)*64.0
|
||||
L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time
|
||||
L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0
|
||||
L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3-PMC4)*64.0/time
|
||||
L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3-PMC4)*64.0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
CPI = CPU_CYCLES/INST_RETIRED
|
||||
L3 load bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL-L2D_CACHE_ALLOCATE)*64.0/time
|
||||
L3 load data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL-L2D_CACHE_ALLOCATE)*64.0
|
||||
L3 evict bandwidth [MBytes/s] = 1.0E-06*L2D_CACHE_WB*64.0/time
|
||||
L3 evict data volume [GBytes] = 1.0E-09*L2D_CACHE_WB*64.0
|
||||
L3 bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL+L2D_CACHE_WB-L2D_CACHE_ALLOCATE)*64.0/time
|
||||
L3 data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL+L2D_CACHE_WB-L2D_CACHE_ALLOCATE)*64.0
|
||||
-
|
||||
Profiling group to measure L2 <-> L3 cache bandwidth. The bandwidth is computed by the
|
||||
number of cache lines loaded from the L3 to the L2 data cache and the writebacks from
|
||||
the L2 data cache to the L3 cache. The group also outputs total data volume transferred between
|
||||
L3 and L2. For streaming-stores, the cache lines are allocated in L2, consequently there
|
||||
is no traffic between L3 and L2 in this case. But the L2D_CACHE_REFILL event counts these
|
||||
allocated cache lines, that's why the value of L2D_CACHE_REFILL is reduced
|
||||
by L2D_CACHE_ALLOCATE.
|
32
collectors/likwid/groups/arm8_tx2/MEM.txt
Normal file
32
collectors/likwid/groups/arm8_tx2/MEM.txt
Normal file
@@ -0,0 +1,32 @@
|
||||
SHORT Main memory bandwidth in MBytes/s
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
MBOX0C0 MEMORY_READS
|
||||
MBOX0C1 MEMORY_WRITES
|
||||
MBOX1C0 MEMORY_READS
|
||||
MBOX1C1 MEMORY_WRITES
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Clock [MHz] 1.E-06*PMC1/time
|
||||
CPI PMC1/PMC0
|
||||
Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0)*64.0/time
|
||||
Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0)*64.0
|
||||
Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1)*64.0/time
|
||||
Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1)*64.0
|
||||
Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64.0/time
|
||||
Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64.0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MEMORY_READS))*64.0/runtime
|
||||
Memory read data volume [GBytes] = 1.0E-09*(SUM(MEMORY_READS))*64.0
|
||||
Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MEMORY_WRITES))*64.0/runtime
|
||||
Memory write data volume [GBytes] = 1.0E-09*(SUM(MEMORY_WRITES))*64.0
|
||||
Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MEMORY_READS)+SUM(MEMORY_WRITES))*64.0/runtime
|
||||
Memory data volume [GBytes] = 1.0E-09*(SUM(MEMORY_READS)+SUM(MEMORY_WRITES))*64.0
|
||||
-
|
||||
Profiling group to measure memory bandwidth. It uses the performance monitoring
|
||||
hardware of the memory controllers.
|
44
collectors/likwid/groups/arm8_tx2/SPEC.txt
Normal file
44
collectors/likwid/groups/arm8_tx2/SPEC.txt
Normal file
@@ -0,0 +1,44 @@
|
||||
SHORT Information about speculative execution
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_SPEC
|
||||
PMC1 LD_SPEC
|
||||
PMC2 ST_SPEC
|
||||
PMC3 DP_SPEC
|
||||
PMC4 VFP_SPEC
|
||||
PMC5 ASE_SPEC
|
||||
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Operations spec. executed PMC0
|
||||
Load ops spec. executed PMC1
|
||||
Store ops spec. executed PMC2
|
||||
Integer data ops spec. executed PMC3
|
||||
Scalar FP ops spec. executed PMC4
|
||||
Vector FP ops spec. executed PMC5
|
||||
Other ops spec. executed (PMC0-PMC1-PMC2-PMC3-PMC4-PMC5)
|
||||
Load ops spec. ratio PMC1/PMC0
|
||||
Store ops spec. ratio PMC2/PMC0
|
||||
Integer data ops spec. ratio PMC3/PMC0
|
||||
Scalar FP ops spec. ratio PMC4/PMC0
|
||||
Vector FP ops spec. ratio PMC5/PMC0
|
||||
Other ops spec. ratio (PMC0-PMC1-PMC2-PMC3-PMC4-PMC5)/PMC0
|
||||
|
||||
|
||||
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Load ops spec. ratio = LD_SPEC / INST_SPEC
|
||||
Store ops spec. ratio = ST_SPEC / INST_SPEC
|
||||
Integer data ops spec. ratio = DP_SPEC / INST_SPEC
|
||||
Scalar FP ops spec. ratio = VFP_SPEC / INST_SPEC
|
||||
Vector FP ops spec. ratio = ASE_SPEC / INST_SPEC
|
||||
Other ops spec. ratio = (INST_SPEC-LD_SPEC-ST_SPEC-DP_SPEC-VFP_SPEC-ASE_SPEC) / INST_SPEC
|
||||
-
|
||||
This group gives information about the speculative execution of micro-ops.
|
||||
It is currently unclear why Other ops spec. executed and ratio is negative
|
||||
in some cases. Although the documentation contains an OP_RETIRED, there is no
|
||||
equivalent OP_SPEC which could be a better reference in this group instead of
|
||||
INST_SPEC.
|
27
collectors/likwid/groups/arm8_tx2/TLB_DATA.txt
Normal file
27
collectors/likwid/groups/arm8_tx2/TLB_DATA.txt
Normal file
@@ -0,0 +1,27 @@
|
||||
SHORT L1 data TLB miss rate/ratio
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 L1D_TLB_REFILL_RD
|
||||
PMC3 L1D_TLB_REFILL_WR
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Clock [MHz] 1.E-06*PMC1/time
|
||||
CPI PMC1/PMC0
|
||||
L1 DTLB load misses PMC2
|
||||
L1 DTLB load miss rate PMC2/PMC0
|
||||
L1 DTLB store misses PMC3
|
||||
L1 DTLB store miss rate PMC3/PMC0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
L1 DTLB load misses = L1D_TLB_REFILL_RD
|
||||
L1 DTLB load miss rate = L1D_TLB_REFILL_RD / INST_RETIRED
|
||||
L1 DTLB store misses = L1D_TLB_REFILL_WR
|
||||
L1 DTLB store miss rate = L1D_TLB_REFILL_WR / INST_RETIRED
|
||||
-
|
||||
The DTLB load and store miss rates give a measure of how often a TLB miss occurred
|
||||
per instruction.
|
||||
|
23
collectors/likwid/groups/arm8_tx2/TLB_INSTR.txt
Normal file
23
collectors/likwid/groups/arm8_tx2/TLB_INSTR.txt
Normal file
@@ -0,0 +1,23 @@
|
||||
SHORT L1 Instruction TLB miss rate/ratio
|
||||
|
||||
EVENTSET
|
||||
PMC0 INST_RETIRED
|
||||
PMC1 CPU_CYCLES
|
||||
PMC2 L1I_TLB_REFILL
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Clock [MHz] 1.E-06*PMC1/time
|
||||
CPI PMC1/PMC0
|
||||
L1 ITLB misses PMC2
|
||||
L1 ITLB miss rate PMC2/PMC0
|
||||
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
L1 ITLB misses = L1I_TLB_REFILL
|
||||
L1 ITLB miss rate = L1I_TLB_REFILL / INST_RETIRED
|
||||
-
|
||||
The ITLB miss rate gives a measure of how often a TLB miss occurred
|
||||
per instruction.
|
||||
|
29
collectors/likwid/groups/atom/BRANCH.txt
Normal file
29
collectors/likwid/groups/atom/BRANCH.txt
Normal file
@@ -0,0 +1,29 @@
|
||||
SHORT Branch prediction miss rate/ratio
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
PMC0 BR_INST_RETIRED_ANY
|
||||
PMC1 BR_INST_RETIRED_MISPRED
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Branch rate PMC0/FIXC0
|
||||
Branch misprediction rate PMC1/FIXC0
|
||||
Branch misprediction ratio PMC1/PMC0
|
||||
Instructions per branch FIXC0/PMC0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Branch rate = BR_INST_RETIRED_ANY/INSTR_RETIRED_ANY
|
||||
Branch misprediction rate = BR_INST_RETIRED_MISPRED/INSTR_RETIRED_ANY
|
||||
Branch misprediction ratio = BR_INST_RETIRED_MISPRED/BR_INST_RETIRED_ANY
|
||||
Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ANY
|
||||
-
|
||||
The rates state how often on average a branch or a mispredicted branch occurred
|
||||
per instruction retired in total. The branch misprediction ratio sets directly
|
||||
into relation what ratio of all branch instructions were mispredicted.
|
||||
Instructions per branch is 1/branch rate.
|
||||
|
20
collectors/likwid/groups/atom/DATA.txt
Normal file
20
collectors/likwid/groups/atom/DATA.txt
Normal file
@@ -0,0 +1,20 @@
|
||||
SHORT Load to store ratio
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
PMC0 L1D_CACHE_LD
|
||||
PMC1 L1D_CACHE_ST
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Load to store ratio PMC0/PMC1
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Load to store ratio = L1D_CACHE_LD/L1D_CACHE_ST
|
||||
-
|
||||
This is a simple metric to determine your load to store ratio.
|
||||
|
25
collectors/likwid/groups/atom/FLOPS_DP.txt
Normal file
25
collectors/likwid/groups/atom/FLOPS_DP.txt
Normal file
@@ -0,0 +1,25 @@
|
||||
SHORT Double Precision MFLOP/s
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
PMC0 SIMD_COMP_INST_RETIRED_PACKED_DOUBLE
|
||||
PMC1 SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1)/time
|
||||
Packed [MUOPS/s] 1.0E-06*PMC0/time
|
||||
Scalar [MUOPS/s] 1.0E-06*PMC1/time
|
||||
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
DP [MFLOP/s] = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_DOUBLE*2.0+SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE)/runtime
|
||||
Packed [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_PACKED_DOUBLE/runtime
|
||||
Scalar [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE/runtime
|
||||
--
|
||||
Double Precision [MFLOP/s] Double Precision MFLOP/s
|
||||
|
24
collectors/likwid/groups/atom/FLOPS_SP.txt
Normal file
24
collectors/likwid/groups/atom/FLOPS_SP.txt
Normal file
@@ -0,0 +1,24 @@
|
||||
SHORT Single Precision MFLOP/s
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
PMC0 SIMD_COMP_INST_RETIRED_PACKED_SINGLE
|
||||
PMC1 SIMD_COMP_INST_RETIRED_SCALAR_SINGLE
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
SP [MFLOP/s] (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
|
||||
Packed [MUOPS/s] 1.0E-06*(PMC0)/time
|
||||
Scalar [MUOPS/s] 1.0E-06*PMC1/time
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
SP [MFLOP/s] = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_SINGLE*4.0+SIMD_COMP_INST_RETIRED_SCALAR_SINGLE)/runtime
|
||||
Packed [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_PACKED_SINGLE/runtime
|
||||
Scalar [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_SCALAR_SINGLE/runtime
|
||||
--
|
||||
Single Precision [MFLOP/s] Single Precision MFLOP/s
|
||||
|
19
collectors/likwid/groups/atom/FLOPS_X87.txt
Normal file
19
collectors/likwid/groups/atom/FLOPS_X87.txt
Normal file
@@ -0,0 +1,19 @@
|
||||
SHORT X87 MFLOP/s
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
PMC0 X87_COMP_OPS_EXE_ANY_AR
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
X87 [MFLOP/s] 1.0E-06*PMC0/time
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
X87 [MFLOP/s] = 1.0E-06*X87_COMP_OPS_EXE_ANY_AR/runtime
|
||||
--
|
||||
The MFLOP/s made with X87 instructions
|
||||
|
21
collectors/likwid/groups/atom/MEM.txt
Normal file
21
collectors/likwid/groups/atom/MEM.txt
Normal file
@@ -0,0 +1,21 @@
|
||||
SHORT Main memory bandwidth in MBytes/s
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
PMC0 BUS_TRANS_MEM_THIS_CORE_THIS_A
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Memory bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
|
||||
Memory data volume [GBytes] 1.0E-09*PMC0*64.0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Memory bandwidth [MBytes/s] = 1.0E-06*BUS_TRANS_MEM_THIS_CORE_THIS_A*64/time
|
||||
Memory data volume [GBytes] = 1.0E-09*BUS_TRANS_MEM_THIS_CORE_THIS_A*64.0
|
||||
-
|
||||
Profiling group to measure memory bandwidth drawn by this core.
|
||||
|
21
collectors/likwid/groups/atom/TLB.txt
Normal file
21
collectors/likwid/groups/atom/TLB.txt
Normal file
@@ -0,0 +1,21 @@
|
||||
SHORT TLB miss rate
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
PMC0 DATA_TLB_MISSES_DTLB_MISS
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
DTLB misses PMC0
|
||||
DTLB miss rate PMC0/FIXC0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
DTLB misses = DATA_TLB_MISSES_DTLB_MISS
|
||||
DTLB miss rate = DATA_TLB_MISSES_DTLB_MISS/INSTR_RETIRED_ANY
|
||||
--
|
||||
The DTLB miss rate gives a measure of how often a TLB miss occurred per instruction.
|
||||
|
31
collectors/likwid/groups/broadwell/BRANCH.txt
Normal file
31
collectors/likwid/groups/broadwell/BRANCH.txt
Normal file
@@ -0,0 +1,31 @@
|
||||
SHORT Branch prediction miss rate/ratio
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PMC0 BR_INST_RETIRED_ALL_BRANCHES
|
||||
PMC1 BR_MISP_RETIRED_ALL_BRANCHES
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
CPI FIXC1/FIXC0
|
||||
Branch rate PMC0/FIXC0
|
||||
Branch misprediction rate PMC1/FIXC0
|
||||
Branch misprediction ratio PMC1/PMC0
|
||||
Instructions per branch FIXC0/PMC0
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
|
||||
Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
|
||||
Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
|
||||
Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
|
||||
-
|
||||
The rates state how often on average a branch or a mispredicted branch occurred
|
||||
per instruction retired in total. The branch misprediction ratio sets directly
|
||||
into relation what ratio of all branch instructions were mispredicted.
|
||||
Instructions per branch is 1/branch rate.
|
||||
|
26
collectors/likwid/groups/broadwell/CLOCK.txt
Normal file
26
collectors/likwid/groups/broadwell/CLOCK.txt
Normal file
@@ -0,0 +1,26 @@
|
||||
SHORT Power and Energy consumption
|
||||
|
||||
EVENTSET
|
||||
FIXC0 INSTR_RETIRED_ANY
|
||||
FIXC1 CPU_CLK_UNHALTED_CORE
|
||||
FIXC2 CPU_CLK_UNHALTED_REF
|
||||
PWR0 PWR_PKG_ENERGY
|
||||
UBOXFIX UNCORE_CLOCK
|
||||
|
||||
METRICS
|
||||
Runtime (RDTSC) [s] time
|
||||
Runtime unhalted [s] FIXC1*inverseClock
|
||||
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
|
||||
Uncore Clock [MHz] 1.E-06*UBOXFIX/time
|
||||
CPI FIXC1/FIXC0
|
||||
Energy [J] PWR0
|
||||
Power [W] PWR0/time
|
||||
|
||||
LONG
|
||||
Formulas:
|
||||
Power = PWR_PKG_ENERGY / time
|
||||
Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time
|
||||
-
|
||||
Broadwell implements the new RAPL interface. This interface makes it possible to
|
||||
monitor the consumed energy on the package (socket) level.
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user