Add likwid collector

This commit is contained in:
Thomas Roehl 2021-03-25 14:47:10 +01:00
parent 4fddcb9741
commit a6ac0c5373
670 changed files with 24926 additions and 0 deletions

301
collectors/likwid/bstrlib.h Normal file
View File

@ -0,0 +1,301 @@
/*
* =======================================================================================
* This source file is part of the bstring string library. This code was
* written by Paul Hsieh in 2002-2008, and is covered by the BSD open source
* license and the GPL. Refer to the accompanying documentation for details
* on usage and license.
*/
/*
* bstrlib.h
*
* This file is the header file for the core module implementing the bstring functions.
*/
#ifndef BSTRLIB_INCLUDE
#define BSTRLIB_INCLUDE
#ifdef __cplusplus
extern "C" {
#endif
#include <stdarg.h>
#include <string.h>
#include <limits.h>
#include <ctype.h>
/*
 * Formatted-output configuration: unless the build already defines
 * BSTRLIB_VSNP_OK or BSTRLIB_NOVSNP, define BSTRLIB_NOVSNP when compiling
 * with Turbo C (__TURBOC__ set, __BORLANDC__ not set). When BSTRLIB_NOVSNP is
 * defined, the bformat family and the bvformata macro further down in this
 * header are not declared.
 * NOTE(review): presumably Turbo C lacks a usable vsnprintf -- confirm.
 */
#if !defined (BSTRLIB_VSNP_OK) && !defined (BSTRLIB_NOVSNP)
# if defined (__TURBOC__) && !defined (__BORLANDC__)
# define BSTRLIB_NOVSNP
# endif
#endif
/* Status codes; the bvformata macro in this header stores one of them in its
 * result argument. */
#define BSTR_ERR (-1)
#define BSTR_OK (0)
/* NOTE(review): presumably the size value that makes bsbufflength report the
 * current buffer length instead of changing it -- confirm against the
 * bstrlib documentation. */
#define BSTR_BS_BUFF_LENGTH_GET (0)
/* Handle types: bstring points to a modifiable struct tagbstring,
 * const_bstring to one that must not be modified through the handle. */
typedef struct tagbstring * bstring;
typedef const struct tagbstring * const_bstring;
/* Copy functions */
#define cstr2bstr bfromcstr
extern bstring bfromcstr (const char * str);
extern bstring bfromcstralloc (int mlen, const char * str);
extern bstring blk2bstr (const void * blk, int len);
extern char * bstr2cstr (const_bstring s, char z);
extern int bcstrfree (char * s);
extern bstring bstrcpy (const_bstring b1);
extern int bassign (bstring a, const_bstring b);
extern int bassignmidstr (bstring a, const_bstring b, int left, int len);
extern int bassigncstr (bstring a, const char * str);
extern int bassignblk (bstring a, const void * s, int len);
/* Destroy function */
extern int bdestroy (bstring b);
/* Space allocation hinting functions */
extern int balloc (bstring s, int len);
extern int ballocmin (bstring b, int len);
/* Substring extraction */
extern bstring bmidstr (const_bstring b, int left, int len);
/* Various standard manipulations */
extern int bconcat (bstring b0, const_bstring b1);
extern int bconchar (bstring b0, char c);
extern int bcatcstr (bstring b, const char * s);
extern int bcatblk (bstring b, const void * s, int len);
extern int binsert (bstring s1, int pos, const_bstring s2, unsigned char fill);
extern int binsertch (bstring s1, int pos, int len, unsigned char fill);
extern int breplace (bstring b1, int pos, int len, const_bstring b2, unsigned char fill);
extern int bdelete (bstring s1, int pos, int len);
extern int bsetstr (bstring b0, int pos, const_bstring b1, unsigned char fill);
extern int btrunc (bstring b, int n);
/* Scan/search functions */
extern int bstricmp (const_bstring b0, const_bstring b1);
extern int bstrnicmp (const_bstring b0, const_bstring b1, int n);
extern int biseqcaseless (const_bstring b0, const_bstring b1);
extern int bisstemeqcaselessblk (const_bstring b0, const void * blk, int len);
extern int biseq (const_bstring b0, const_bstring b1);
extern int bisstemeqblk (const_bstring b0, const void * blk, int len);
extern int biseqcstr (const_bstring b, const char * s);
extern int biseqcstrcaseless (const_bstring b, const char * s);
extern int bstrcmp (const_bstring b0, const_bstring b1);
extern int bstrncmp (const_bstring b0, const_bstring b1, int n);
extern int binstr (const_bstring s1, int pos, const_bstring s2);
extern int binstrr (const_bstring s1, int pos, const_bstring s2);
extern int binstrcaseless (const_bstring s1, int pos, const_bstring s2);
extern int binstrrcaseless (const_bstring s1, int pos, const_bstring s2);
extern int bstrchrp (const_bstring b, int c, int pos);
extern int bstrrchrp (const_bstring b, int c, int pos);
/* Convenience wrappers: bstrchr calls bstrchrp with start position 0,
 * bstrrchr calls bstrrchrp with start position blength(b)-1 (which is -1 when
 * blength reports 0). bstrrchr expands b twice, so b must not have side
 * effects. */
#define bstrchr(b,c) bstrchrp ((b), (c), 0)
#define bstrrchr(b,c) bstrrchrp ((b), (c), blength(b)-1)
extern int binchr (const_bstring b0, int pos, const_bstring b1);
extern int binchrr (const_bstring b0, int pos, const_bstring b1);
extern int bninchr (const_bstring b0, int pos, const_bstring b1);
extern int bninchrr (const_bstring b0, int pos, const_bstring b1);
extern int bfindreplace (bstring b, const_bstring find, const_bstring repl, int pos);
extern int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl, int pos);
/* List of string container functions */
/* Container holding an array of bstring handles; instances are obtained from
 * bstrListCreate and the split functions declared below. */
struct bstrList {
/* NOTE(review): presumably qty is the number of entries in use and mlen the
 * number of slots allocated in entry -- confirm against the implementation. */
int qty, mlen;
/* Array of bstring handles. */
bstring * entry;
};
extern struct bstrList * bstrListCreate (void);
extern int bstrListDestroy (struct bstrList * sl);
extern int bstrListAlloc (struct bstrList * sl, int msz);
extern int bstrListAllocMin (struct bstrList * sl, int msz);
/* String split and join functions */
extern struct bstrList * bsplit (const_bstring str, unsigned char splitChar);
extern struct bstrList * bsplits (const_bstring str, const_bstring splitStr);
extern struct bstrList * bsplitstr (const_bstring str, const_bstring splitStr);
extern bstring bjoin (const struct bstrList * bl, const_bstring sep);
extern int bsplitcb (const_bstring str, unsigned char splitChar, int pos,
int (* cb) (void * parm, int ofs, int len), void * parm);
extern int bsplitscb (const_bstring str, const_bstring splitStr, int pos,
int (* cb) (void * parm, int ofs, int len), void * parm);
extern int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos,
int (* cb) (void * parm, int ofs, int len), void * parm);
/* Miscellaneous functions */
extern int bpattern (bstring b, int len);
extern int btoupper (bstring b);
extern int btolower (bstring b);
extern int bltrimws (bstring b);
extern int brtrimws (bstring b);
extern int btrimws (bstring b);
/*
 * printf-style interface; only declared when BSTRLIB_NOVSNP is not defined.
 *
 * bvformata(ret, b, fmt, lastarg) is a statement macro meant to be expanded
 * inside a variadic function whose last named parameter is lastarg:
 *   - b and fmt are evaluated once and copied into temporaries;
 *   - bvcformata is called with a count that starts at 16, each time with a
 *     freshly started va_list;
 *   - a non-negative result ends the loop with BSTR_OK;
 *   - a negative result whose magnitude does not exceed the count just tried
 *     ends the loop with BSTR_ERR;
 *   - any other negative result is negated and used as the next count;
 *   - the final status is assigned to ret once, after the loop.
 * The macro expands to a brace block (not do/while(0)), so a trailing ';'
 * adds an empty statement; wrap the call in braces when it is the body of an
 * if that has an else.
 */
#if !defined (BSTRLIB_NOVSNP)
extern bstring bformat (const char * fmt, ...);
extern int bformata (bstring b, const char * fmt, ...);
extern int bassignformat (bstring b, const char * fmt, ...);
extern int bvcformata (bstring b, int count, const char * fmt, va_list arglist);
#define bvformata(ret, b, fmt, lastarg) { \
bstring bstrtmp_b = (b); \
const char * bstrtmp_fmt = (fmt); \
int bstrtmp_r = BSTR_ERR, bstrtmp_sz = 16; \
for (;;) { \
va_list bstrtmp_arglist; \
va_start (bstrtmp_arglist, lastarg); \
bstrtmp_r = bvcformata (bstrtmp_b, bstrtmp_sz, bstrtmp_fmt, bstrtmp_arglist); \
va_end (bstrtmp_arglist); \
if (bstrtmp_r >= 0) { /* Everything went ok */ \
bstrtmp_r = BSTR_OK; \
break; \
} else if (-bstrtmp_r <= bstrtmp_sz) { /* A real error? */ \
bstrtmp_r = BSTR_ERR; \
break; \
} \
bstrtmp_sz = -bstrtmp_r; /* Doubled or target size */ \
} \
ret = bstrtmp_r; \
}
#endif
/* Caller-supplied input callbacks used by the input and stream functions
 * below; parm is the opaque pointer handed to those functions.
 * NOTE(review): the parameter lists mirror fgetc and fread with the stream
 * passed as parm -- presumably so those functions can be used through a
 * cast; confirm against the bstrlib documentation. */
typedef int (*bNgetc) (void *parm);
typedef size_t (* bNread) (void *buff, size_t elsize, size_t nelem, void *parm);
/* Input functions */
extern bstring bgets (bNgetc getcPtr, void * parm, char terminator);
extern bstring bread (bNread readPtr, void * parm);
extern int bgetsa (bstring b, bNgetc getcPtr, void * parm, char terminator);
extern int bassigngets (bstring b, bNgetc getcPtr, void * parm, char terminator);
extern int breada (bstring b, bNread readPtr, void * parm);
/* Stream functions */
extern struct bStream * bsopen (bNread readPtr, void * parm);
extern void * bsclose (struct bStream * s);
extern int bsbufflength (struct bStream * s, int sz);
extern int bsreadln (bstring b, struct bStream * s, char terminator);
extern int bsreadlns (bstring r, struct bStream * s, const_bstring term);
extern int bsread (bstring b, struct bStream * s, int n);
extern int bsreadlna (bstring b, struct bStream * s, char terminator);
extern int bsreadlnsa (bstring r, struct bStream * s, const_bstring term);
extern int bsreada (bstring b, struct bStream * s, int n);
extern int bsunread (struct bStream * s, const_bstring b);
extern int bspeek (bstring r, const struct bStream * s);
extern int bssplitscb (struct bStream * s, const_bstring splitStr,
int (* cb) (void * parm, int ofs, const_bstring entry), void * parm);
extern int bssplitstrcb (struct bStream * s, const_bstring splitStr,
int (* cb) (void * parm, int ofs, const_bstring entry), void * parm);
extern int bseof (const struct bStream * s);
/* Representation behind the bstring / const_bstring handles. The member
 * order is relied upon by the positional initializer in bsStaticMlen. */
struct tagbstring {
/* Values <= 0 are reported as write protected by biswriteprotected; the
 * reference-building macros store -1 or -__LINE__ here.
 * NOTE(review): positive values presumably give the allocated size of data
 * -- confirm against the implementation. */
int mlen;
/* String length; blengthe treats a negative value as invalid. */
int slen;
/* Character storage; bdataofse treats a null pointer as invalid. */
unsigned char * data;
};
/*
 * Accessor macros: null-tolerant readers for the length, the data pointer and
 * single characters of a bstring. Arguments can be expanded more than once,
 * so pass expressions without side effects.
 */

/* Length of b; (int)(e) if b is a null pointer or holds a negative length. */
#define blengthe(b, e) (((b) != (void *)0 && (b)->slen >= 0) ? ((b)->slen) : (int)(e))
/* Length of b; 0 if b is a null pointer or holds a negative length. */
#define blength(b) blengthe ((b), 0)
/* char pointer o bytes into b's data; (char *)(e) if b or its data is null. */
#define bdataofse(b, o, e) (((b) != (void *)0 && (b)->data != (void *)0) ? ((char *)(b)->data) + (o) : (char *)(e))
/* Same as bdataofse with a null pointer as the fallback. */
#define bdataofs(b, o) bdataofse ((b), (o), (void *)0)
/* Start of b's data; (char *)(e) if b or its data is null. */
#define bdatae(b, e) bdataofse ((b), 0, (e))
/* Start of b's data; a null pointer if b or its data is null. */
#define bdata(b) bdataofs ((b), 0)
/* Character at index p; e if p lies outside 0 .. blength(b)-1 (the unsigned
 * comparison also rejects negative p). */
#define bchare(b, p, e) (((unsigned)blength(b) > ((unsigned)(p))) ? ((b)->data[(p)]) : (e))
/* Character at index p; '\0' if p is out of range. */
#define bchar(b, p) bchare ((b), (p), '\0')
/* Static constant string initialization macro */
/* Expands to a brace initializer for a struct tagbstring in member order
 * (mlen, slen, data): mlen is m, slen is sizeof(q)-1 (the terminating '\0' is
 * not counted) and data points at q. Pasting "" on both sides of q only
 * compiles when q is a string literal. */
#define bsStaticMlen(q,m) {(m), (int) sizeof(q)-1, (unsigned char *) ("" q "")}
/* bsStatic picks a negative mlen, which biswriteprotected reports as write
 * protected: a fixed -32 under MSVC, -__LINE__ everywhere else (unless
 * bsStatic was already defined before this point).
 * NOTE(review): presumably some MSVC versions do not accept __LINE__ in a
 * constant initializer -- confirm against upstream bstrlib. */
#if defined(_MSC_VER)
# define bsStatic(q) bsStaticMlen(q,-32)
#endif
#ifndef bsStatic
# define bsStatic(q) bsStaticMlen(q,-__LINE__)
#endif
/* Static constant block parameter pair */
/* Expands to TWO comma-separated arguments -- a void pointer to the string
 * literal q and its length without the terminator -- for functions declared
 * with a (const void * blk, int len) parameter pair. */
#define bsStaticBlkParms(q) ((void *)("" q "")), ((int) sizeof(q)-1)
/* Reference building macros */
/* cstr2tbstr is an alternative spelling of btfromcstr. */
#define cstr2tbstr btfromcstr
/*
 * Turn the struct tagbstring lvalue t into a reference to the C string s
 * without copying: data aliases s, slen is strlen(s) (0 when s is a null
 * pointer) and mlen is -1. s is evaluated once, t several times. strlen is
 * written as (strlen) so that a function-like macro of that name is bypassed.
 * Expands to a brace block.
 */
#define btfromcstr(t,s) { \
    (t).data = (unsigned char *) (s); \
    if ((t).data) { \
        (t).slen = (int) (strlen) ((char *) (t).data); \
    } else { \
        (t).slen = 0; \
    } \
    (t).mlen = -1; \
}
/*
 * Turn the struct tagbstring lvalue t into a reference to the l bytes that
 * start at s, without copying: data aliases s, slen is l and mlen is -1.
 * Neither s nor l is validated. Every macro argument is parenthesized at its
 * point of use, matching the other reference-building macros. s and l are
 * evaluated once, t several times. Expands to a brace block.
 */
#define blk2tbstr(t,s,l) { \
    (t).data = (unsigned char *) (s); \
    (t).slen = (l); \
    (t).mlen = -1; \
}
/* btfromblk is an alternative spelling of blk2tbstr. */
#define btfromblk(t,s,l) blk2tbstr(t,s,l)
/*
 * Turn the struct tagbstring lvalue t into a reference to the part of b that
 * starts at index p and is at most l characters long, without copying.
 *   - b is evaluated once; p and l are evaluated once each, and only when b
 *     is non-null with non-null data and a non-negative length.
 *   - A negative p is moved to 0 and l is shortened by the same amount.
 *   - l is clipped to what remains of b after p.
 *   - If b is unusable or nothing remains, t refers to an empty string
 *     literal with slen 0.
 *   - mlen is always set to -__LINE__.
 * Expands to a brace block.
 */
#define bmid2tbstr(t,b,p,l) { \
    const_bstring bstrtmp_s = (b); \
    if (!bstrtmp_s || !bstrtmp_s->data || bstrtmp_s->slen < 0) { \
        /* unusable source: empty reference */ \
        (t).data = (unsigned char *)""; \
        (t).slen = 0; \
    } else { \
        int bstrtmp_left = (p); \
        int bstrtmp_len = (l); \
        if (bstrtmp_left < 0) { \
            bstrtmp_len += bstrtmp_left; \
            bstrtmp_left = 0; \
        } \
        if (bstrtmp_len > bstrtmp_s->slen - bstrtmp_left) { \
            bstrtmp_len = bstrtmp_s->slen - bstrtmp_left; \
        } \
        if (bstrtmp_len > 0) { \
            (t).data = bstrtmp_s->data + bstrtmp_left; \
            (t).slen = bstrtmp_len; \
        } else { \
            (t).data = (unsigned char *)""; \
            (t).slen = 0; \
        } \
    } \
    (t).mlen = -__LINE__; \
}
/*
 * Turn the struct tagbstring lvalue t into a reference to the block of l
 * bytes at s with leading whitespace (as classified by isspace) skipped,
 * without copying. data points at the first non-whitespace byte, slen is the
 * number of bytes from there to the end of the block, mlen is -__LINE__.
 *
 * s is cast to unsigned char * like in btfromcstr and blk2tbstr, so char *
 * and void * buffers are accepted without diagnostics (and in C++).
 * When s is a null pointer or l is negative nothing is scanned: data is s
 * and slen is l, unchanged. s and l are evaluated once, t several times.
 * Expands to a brace block.
 */
#define btfromblkltrimws(t,s,l) { \
    int bstrtmp_idx = 0, bstrtmp_len = (l); \
    unsigned char * bstrtmp_s = (unsigned char *) (s); \
    if (bstrtmp_s && bstrtmp_len >= 0) { \
        for (; bstrtmp_idx < bstrtmp_len; bstrtmp_idx++) { \
            if (!isspace (bstrtmp_s[bstrtmp_idx])) break; \
        } \
    } \
    (t).data = bstrtmp_s + bstrtmp_idx; \
    (t).slen = bstrtmp_len - bstrtmp_idx; \
    (t).mlen = -__LINE__; \
}
/*
 * Turn the struct tagbstring lvalue t into a reference to the block of l
 * bytes at s with trailing whitespace (as classified by isspace) dropped,
 * without copying. data is s, slen is the index just past the last
 * non-whitespace byte (0 if there is none), mlen is -__LINE__.
 *
 * s is cast to unsigned char * like in btfromcstr and blk2tbstr, so char *
 * and void * buffers are accepted without diagnostics (and in C++).
 * When s is a null pointer or l is not positive nothing is scanned and slen
 * is l, unchanged. s and l are evaluated once, t several times.
 * Expands to a brace block.
 */
#define btfromblkrtrimws(t,s,l) { \
    int bstrtmp_len = (l) - 1; \
    unsigned char * bstrtmp_s = (unsigned char *) (s); \
    if (bstrtmp_s && bstrtmp_len >= 0) { \
        for (; bstrtmp_len >= 0; bstrtmp_len--) { \
            if (!isspace (bstrtmp_s[bstrtmp_len])) break; \
        } \
    } \
    (t).data = bstrtmp_s; \
    (t).slen = bstrtmp_len + 1; \
    (t).mlen = -__LINE__; \
}
/*
 * Turn the struct tagbstring lvalue t into a reference to the block of l
 * bytes at s with both leading and trailing whitespace (as classified by
 * isspace) removed, without copying. data points at the first
 * non-whitespace byte, slen covers up to and including the last
 * non-whitespace byte (0 if the block is all whitespace), mlen is -__LINE__.
 *
 * s is cast to unsigned char * like in btfromcstr and blk2tbstr, so char *
 * and void * buffers are accepted without diagnostics (and in C++).
 * When s is a null pointer or l is not positive nothing is scanned: data is
 * s and slen is l, unchanged. s and l are evaluated once, t several times.
 * Expands to a brace block.
 */
#define btfromblktrimws(t,s,l) { \
    int bstrtmp_idx = 0, bstrtmp_len = (l) - 1; \
    unsigned char * bstrtmp_s = (unsigned char *) (s); \
    if (bstrtmp_s && bstrtmp_len >= 0) { \
        for (; bstrtmp_idx <= bstrtmp_len; bstrtmp_idx++) { \
            if (!isspace (bstrtmp_s[bstrtmp_idx])) break; \
        } \
        for (; bstrtmp_len >= bstrtmp_idx; bstrtmp_len--) { \
            if (!isspace (bstrtmp_s[bstrtmp_len])) break; \
        } \
    } \
    (t).data = bstrtmp_s + bstrtmp_idx; \
    (t).slen = bstrtmp_len + 1 - bstrtmp_idx; \
    (t).mlen = -__LINE__; \
}
/* Write protection macros */
/* Mark the struct tagbstring lvalue t as read-only: a non-negative mlen is
 * replaced by -1, a negative mlen is left as it is. Expands to a brace block. */
#define bwriteprotect(t) { if (0 <= (t).mlen) { (t).mlen = -1; } }
/* Lift the protection set by bwriteprotect: only when mlen is exactly -1 it
 * becomes slen, or 1 when slen is 0. Other mlen values are left untouched.
 * Expands to a brace block. */
#define bwriteallow(t) { if (-1 == (t).mlen) { (t).mlen = ((t).slen == 0) ? 1 : (t).slen; } }
/* Non-zero when t is write protected, i.e. its mlen is zero or negative. */
#define biswriteprotected(t) (0 >= (t).mlen)
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,31 @@
SHORT Branch prediction miss rate/ratio
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 BR_INST_RETIRED_ALL_BRANCHES
PMC1 BR_MISP_RETIRED_ALL_BRANCHES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Branch rate PMC0/FIXC0
Branch misprediction rate PMC1/FIXC0
Branch misprediction ratio PMC1/PMC0
Instructions per branch FIXC0/PMC0
LONG
Formulas:
Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
-
The rates state how often on average a branch or a mispredicted branch occurred
per instruction retired in total. The branch misprediction ratio directly
relates the number of mispredicted branches to the number of all branch
instructions. Instructions per branch is 1/branch rate.

View File

@ -0,0 +1,143 @@
SHORT Cache bandwidth in MBytes/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 L1D_REPLACEMENT
PMC1 L1D_M_EVICT
PMC2 L2_LINES_IN_ALL
PMC3 L2_TRANS_L2_WB
CBOX0C1 LLC_VICTIMS_M_STATE
CBOX1C1 LLC_VICTIMS_M_STATE
CBOX2C1 LLC_VICTIMS_M_STATE
CBOX3C1 LLC_VICTIMS_M_STATE
CBOX4C1 LLC_VICTIMS_M_STATE
CBOX5C1 LLC_VICTIMS_M_STATE
CBOX6C1 LLC_VICTIMS_M_STATE
CBOX7C1 LLC_VICTIMS_M_STATE
CBOX8C1 LLC_VICTIMS_M_STATE
CBOX9C1 LLC_VICTIMS_M_STATE
CBOX10C1 LLC_VICTIMS_M_STATE
CBOX11C1 LLC_VICTIMS_M_STATE
CBOX12C1 LLC_VICTIMS_M_STATE
CBOX13C1 LLC_VICTIMS_M_STATE
CBOX14C1 LLC_VICTIMS_M_STATE
CBOX15C1 LLC_VICTIMS_M_STATE
CBOX16C1 LLC_VICTIMS_M_STATE
CBOX17C1 LLC_VICTIMS_M_STATE
CBOX18C1 LLC_VICTIMS_M_STATE
CBOX19C1 LLC_VICTIMS_M_STATE
CBOX20C1 LLC_VICTIMS_M_STATE
CBOX21C1 LLC_VICTIMS_M_STATE
CBOX22C1 LLC_VICTIMS_M_STATE
CBOX23C1 LLC_VICTIMS_M_STATE
CBOX24C1 LLC_VICTIMS_M_STATE
CBOX25C1 LLC_VICTIMS_M_STATE
CBOX26C1 LLC_VICTIMS_M_STATE
CBOX27C1 LLC_VICTIMS_M_STATE
CBOX0C0 LLC_LOOKUP_DATA_READ
CBOX1C0 LLC_LOOKUP_DATA_READ
CBOX2C0 LLC_LOOKUP_DATA_READ
CBOX3C0 LLC_LOOKUP_DATA_READ
CBOX4C0 LLC_LOOKUP_DATA_READ
CBOX5C0 LLC_LOOKUP_DATA_READ
CBOX6C0 LLC_LOOKUP_DATA_READ
CBOX7C0 LLC_LOOKUP_DATA_READ
CBOX8C0 LLC_LOOKUP_DATA_READ
CBOX9C0 LLC_LOOKUP_DATA_READ
CBOX10C0 LLC_LOOKUP_DATA_READ
CBOX11C0 LLC_LOOKUP_DATA_READ
CBOX12C0 LLC_LOOKUP_DATA_READ
CBOX13C0 LLC_LOOKUP_DATA_READ
CBOX14C0 LLC_LOOKUP_DATA_READ
CBOX15C0 LLC_LOOKUP_DATA_READ
CBOX16C0 LLC_LOOKUP_DATA_READ
CBOX17C0 LLC_LOOKUP_DATA_READ
CBOX18C0 LLC_LOOKUP_DATA_READ
CBOX19C0 LLC_LOOKUP_DATA_READ
CBOX20C0 LLC_LOOKUP_DATA_READ
CBOX21C0 LLC_LOOKUP_DATA_READ
CBOX22C0 LLC_LOOKUP_DATA_READ
CBOX23C0 LLC_LOOKUP_DATA_READ
CBOX24C0 LLC_LOOKUP_DATA_READ
CBOX25C0 LLC_LOOKUP_DATA_READ
CBOX26C0 LLC_LOOKUP_DATA_READ
CBOX27C0 LLC_LOOKUP_DATA_READ
MBOX0C0 CAS_COUNT_RD
MBOX0C1 CAS_COUNT_WR
MBOX1C0 CAS_COUNT_RD
MBOX1C1 CAS_COUNT_WR
MBOX2C0 CAS_COUNT_RD
MBOX2C1 CAS_COUNT_WR
MBOX3C0 CAS_COUNT_RD
MBOX3C1 CAS_COUNT_WR
MBOX4C0 CAS_COUNT_RD
MBOX4C1 CAS_COUNT_WR
MBOX5C0 CAS_COUNT_RD
MBOX5C1 CAS_COUNT_WR
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0
L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0
L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time
L3 to L2 load data volume [GBytes] 1.0E-09*PMC2*64.0
L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time
L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0
L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0)*64.0/time
System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0)*64.0
L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64/time
L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64
L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64.0/time
L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64.0
Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time
Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0
Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time
Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0
Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time
Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0
LONG
Formulas:
L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64
L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64
L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time
L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64
L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time
L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64
L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time
L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64
L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ))*64/time
System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ))*64
L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M_STATE))*64/time
L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M_STATE))*64
L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M_STATE))*64/time
L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M_STATE))*64
Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time
Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time
Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0
Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time
Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0
-
Group to measure cache transfers between L1 and Memory. Please note that the
L3 to/from system metrics should contain all traffic to the system (memory,
Intel QPI, etc.), but the events do not seem to capture everything: commonly the
memory read bandwidth and the L3 to L2 bandwidth are higher than the memory to
L3 bandwidth.

View File

@ -0,0 +1,26 @@
SHORT Power and Energy consumption
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PWR0 PWR_PKG_ENERGY
UBOXFIX UNCORE_CLOCK
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
Uncore Clock [MHz] 1.E-06*UBOXFIX/time
CPI FIXC1/FIXC0
Energy [J] PWR0
Power [W] PWR0/time
LONG
Formulas:
Power [W] = PWR_PKG_ENERGY / time
Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time
-
Broadwell implements the new RAPL interface. This interface makes it possible to
monitor the consumed energy on the package (socket) level.

View File

@ -0,0 +1,38 @@
SHORT Cycle Activities
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING
PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING
PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING
PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Cycles without execution [%] (PMC3/FIXC1)*100
Cycles without execution due to L1D [%] (PMC2/FIXC1)*100
Cycles without execution due to L2 [%] (PMC0/FIXC1)*100
Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100
LONG
Formulas:
Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100
Cycles without execution due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100
Cycles without execution due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100
Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100
--
This performance group measures the cycles while waiting for data from the cache
and memory hierarchy.
CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts the number of cycles in which nothing
is executed on any execution port.
CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is
outstanding.
CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is
outstanding.
CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while memory subsystem has an
outstanding load.

View File

@ -0,0 +1,45 @@
SHORT Cycle Activities (Stalls)
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING
PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING
PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING
PMC3 CYCLE_ACTIVITY_STALLS_TOTAL
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Total execution stalls PMC3
Stalls caused by L1D misses [%] (PMC2/PMC3)*100
Stalls caused by L2 misses [%] (PMC0/PMC3)*100
Stalls caused by memory loads [%] (PMC1/PMC3)*100
Execution stall rate [%] (PMC3/FIXC1)*100
Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100
Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100
Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100
LONG
Formulas:
Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL
Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100
Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100
Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100
Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100
--
This performance group measures the stalls caused by data traffic in the cache
hierarchy.
CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls.
CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand
load is outstanding.
CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand
load is outstanding.
CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while memory subsystem has
an outstanding load.

View File

@ -0,0 +1,22 @@
SHORT Load to store ratio
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 MEM_INST_RETIRED_ALL_LOADS
PMC1 MEM_INST_RETIRED_ALL_STORES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Load to store ratio PMC0/PMC1
LONG
Formulas:
Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES
-
This is a metric to determine your load to store ratio.

View File

@ -0,0 +1,24 @@
SHORT Divide unit information
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 ARITH_DIVIDER_COUNT
PMC1 ARITH_DIVIDER_ACTIVE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Number of divide ops PMC0
Avg. divide unit usage duration PMC1/PMC0
LONG
Formulas:
Number of divide ops = ARITH_DIVIDER_COUNT
Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT
--
This performance group measures the average latency of divide operations.

View File

@ -0,0 +1,35 @@
SHORT Power and Energy consumption
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
TMP0 TEMP_CORE
PWR0 PWR_PKG_ENERGY
PWR1 PWR_PP0_ENERGY
PWR3 PWR_DRAM_ENERGY
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Temperature [C] TMP0
Energy [J] PWR0
Power [W] PWR0/time
Energy PP0 [J] PWR1
Power PP0 [W] PWR1/time
Energy DRAM [J] PWR3
Power DRAM [W] PWR3/time
LONG
Formulas:
Power [W] = PWR_PKG_ENERGY / time
Power PP0 [W] = PWR_PP0_ENERGY / time
Power DRAM [W] = PWR_DRAM_ENERGY / time
-
Broadwell implements the new RAPL interface. This interface makes it possible to
monitor the consumed energy on the package (socket) and DRAM level.

View File

@ -0,0 +1,25 @@
SHORT Packed AVX MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
PMC2 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0+PMC2*16.0)/time
Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0+PMC3*8.0)/time
LONG
Formulas:
Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
-
Packed single (32b) and double (64b) precision AVX and AVX-512 FLOP rates.

View File

@ -0,0 +1,34 @@
SHORT Double Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time
AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time
AVX512 DP [MFLOP/s] 1.0E-06*(PMC3*8.0)/time
Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time
Scalar [MUOPS/s] 1.0E-06*PMC1/time
Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3)
LONG
Formulas:
DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
AVX512 DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime
Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)
-
SSE, AVX and AVX-512 scalar and packed double precision FLOP rates.

View File

@ -0,0 +1,34 @@
SHORT Single Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE
PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time
AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time
AVX512 SP [MFLOP/s] 1.0E-06*(PMC3*16.0)/time
Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time
Scalar [MUOPS/s] 1.0E-06*PMC1/time
Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3)
LONG
Formulas:
SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
AVX512 SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime
Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)
-
SSE, AVX and AVX-512 scalar and packed single precision FLOP rates.

View File

@ -0,0 +1,38 @@
SHORT L2 cache bandwidth in MBytes/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 L1D_REPLACEMENT
PMC1 L1D_M_EVICT
PMC2 ICACHE_64B_IFTAG_MISS
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
LONG
Formulas:
L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_64B_IFTAG_MISS)*64/time
L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_64B_IFTAG_MISS)*64
-
Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the
number of cache lines allocated in the L1 and the number of modified cache lines
evicted from the L1. The group also outputs the total data volume transferred
between L2 and L1. Note that this bandwidth also includes data transfers due to a
write allocate load on a store miss in L1 and traffic caused by misses in the
L1 instruction cache.

View File

@ -0,0 +1,34 @@
SHORT L2 cache miss rate/ratio
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 L2_TRANS_ALL_REQUESTS
PMC1 L2_RQSTS_MISS
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
L2 request rate PMC0/FIXC0
L2 miss rate PMC1/FIXC0
L2 miss ratio PMC1/PMC0
LONG
Formulas:
L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
-
This group measures the locality of your data accesses with regard to the
L2 cache. L2 request rate tells you how data intensive your code is
or how many data accesses you have on average per instruction.
The L2 miss rate gives a measure how often it was necessary to get
cache lines from memory. And finally L2 miss ratio tells you how many of your
memory references required a cache line to be loaded from a higher level.
While the data cache miss rate might be given by your algorithm you should
try to get data cache miss ratio as low as possible by increasing your cache reuse.

View File

@ -0,0 +1,36 @@
SHORT L3 cache bandwidth in MBytes/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 L2_LINES_IN_ALL
PMC1 L2_TRANS_L2_WB
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
L3 load data volume [GBytes] 1.0E-09*PMC0*64.0
L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0
L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
LONG
Formulas:
L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time
L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0
L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
-
Profiling group to measure L3 cache bandwidth. The bandwidth is computed from the
number of cache lines allocated in the L2 and the number of modified cache lines
evicted from the L2. This group also outputs the data volume transferred between the
L3 and the measured cores' L2 caches. Note that this bandwidth also includes data
transfers due to a write allocate load on a store miss in L2.

View File

@ -0,0 +1,35 @@
SHORT L3 cache miss rate/ratio
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 MEM_LOAD_RETIRED_L3_HIT
PMC1 MEM_LOAD_RETIRED_L3_MISS
PMC2 UOPS_RETIRED_ALL
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
L3 request rate (PMC0+PMC1)/PMC2
L3 miss rate PMC1/PMC2
L3 miss ratio PMC1/(PMC0+PMC1)
LONG
Formulas:
L3 request rate = (MEM_LOAD_RETIRED_L3_HIT+MEM_LOAD_RETIRED_L3_MISS)/UOPS_RETIRED_ALL
L3 miss rate = MEM_LOAD_RETIRED_L3_MISS/UOPS_RETIRED_ALL
L3 miss ratio = MEM_LOAD_RETIRED_L3_MISS/(MEM_LOAD_RETIRED_L3_HIT+MEM_LOAD_RETIRED_L3_MISS)
-
This group measures the locality of your data accesses with regard to the
L3 cache. L3 request rate tells you how data intensive your code is
or how many data accesses you have on average per instruction.
The L3 miss rate gives a measure how often it was necessary to get
cache lines from memory. And finally L3 miss ratio tells you how many of your
memory references required a cache line to be loaded from a higher level.
While the data cache miss rate might be given by your algorithm you should
try to get data cache miss ratio as low as possible by increasing your cache reuse.

View File

@ -0,0 +1,48 @@
SHORT Main memory bandwidth in MBytes/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
MBOX0C0 CAS_COUNT_RD
MBOX0C1 CAS_COUNT_WR
MBOX1C0 CAS_COUNT_RD
MBOX1C1 CAS_COUNT_WR
MBOX2C0 CAS_COUNT_RD
MBOX2C1 CAS_COUNT_WR
MBOX3C0 CAS_COUNT_RD
MBOX3C1 CAS_COUNT_WR
MBOX4C0 CAS_COUNT_RD
MBOX4C1 CAS_COUNT_WR
MBOX5C0 CAS_COUNT_RD
MBOX5C1 CAS_COUNT_WR
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time
Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0
Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time
Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0
Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time
Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0
LONG
Formulas:
Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime
Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime
Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime
Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
-
Profiling group to measure memory bandwidth drawn by all cores of a socket.
Since this group is based on Uncore events it is only possible to measure on a
per socket base. Some of the counters may not be available on your system.
Also outputs total data volume transferred from main memory.
The same metrics are provided by the HA group.

View File

@ -0,0 +1,70 @@
SHORT Overview of arithmetic and main memory performance
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PWR0 PWR_PKG_ENERGY
PWR3 PWR_DRAM_ENERGY
PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE
MBOX0C0 CAS_COUNT_RD
MBOX0C1 CAS_COUNT_WR
MBOX1C0 CAS_COUNT_RD
MBOX1C1 CAS_COUNT_WR
MBOX2C0 CAS_COUNT_RD
MBOX2C1 CAS_COUNT_WR
MBOX3C0 CAS_COUNT_RD
MBOX3C1 CAS_COUNT_WR
MBOX4C0 CAS_COUNT_RD
MBOX4C1 CAS_COUNT_WR
MBOX5C0 CAS_COUNT_RD
MBOX5C1 CAS_COUNT_WR
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Energy [J] PWR0
Power [W] PWR0/time
Energy DRAM [J] PWR3
Power DRAM [W] PWR3/time
DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time
AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time
Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time
Scalar [MUOPS/s] 1.0E-06*PMC1/time
Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time
Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0
Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time
Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0
Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time
Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0
Operational intensity (PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0)
LONG
Formulas:
Power [W] = PWR_PKG_ENERGY/runtime
Power DRAM [W] = PWR_DRAM_ENERGY/runtime
DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime
Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/runtime
Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime
Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0
Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/runtime
Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0
Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/((SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0)
--
Profiling group to measure memory bandwidth drawn by all cores of a socket.
Since this group is based on Uncore events it is only possible to measure on
a per socket base. Also outputs total data volume transferred from main memory.
SSE scalar and packed double precision FLOP rates. Also reports on packed AVX
32b instructions.
The operational intensity is calculated using the FP values of the cores and the
memory data volume of the whole socket. The actual operational intensity for
multiple CPUs can be found in the statistics table in the Sum column.

View File

@ -0,0 +1,70 @@
SHORT Overview of arithmetic and main memory performance
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PWR0 PWR_PKG_ENERGY
PWR3 PWR_DRAM_ENERGY
PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE
PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE
MBOX0C0 CAS_COUNT_RD
MBOX0C1 CAS_COUNT_WR
MBOX1C0 CAS_COUNT_RD
MBOX1C1 CAS_COUNT_WR
MBOX2C0 CAS_COUNT_RD
MBOX2C1 CAS_COUNT_WR
MBOX3C0 CAS_COUNT_RD
MBOX3C1 CAS_COUNT_WR
MBOX4C0 CAS_COUNT_RD
MBOX4C1 CAS_COUNT_WR
MBOX5C0 CAS_COUNT_RD
MBOX5C1 CAS_COUNT_WR
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Energy [J] PWR0
Power [W] PWR0/time
Energy DRAM [J] PWR3
Power DRAM [W] PWR3/time
SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time
AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time
Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time
Scalar [MUOPS/s] 1.0E-06*PMC1/time
Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time
Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0
Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time
Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0
Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time
Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0
Operational intensity (PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0)
LONG
Formulas:
Power [W] = PWR_PKG_ENERGY/runtime
Power DRAM [W] = PWR_DRAM_ENERGY/runtime
SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime
Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/runtime
Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime
Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0
Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/runtime
Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0
Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/((SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0)
--
Profiling group to measure memory bandwidth drawn by all cores of a socket.
Since this group is based on Uncore events it is only possible to measure on
a per socket base. Also outputs total data volume transferred from main memory.
SSE scalar and packed single precision FLOP rates. Also reports on packed AVX
32b instructions.
The operational intensity is calculated using the FP values of the cores and the
memory data volume of the whole socket. The actual operational intensity for
multiple CPUs can be found in the statistics table in the Sum column.

View File

@ -0,0 +1,46 @@
SHORT Intel Optane DC bandwidth in MBytes/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
MBOX0C0 PMM_CMD1_RD
MBOX0C1 PMM_CMD1_WR
MBOX1C0 PMM_CMD1_RD
MBOX1C1 PMM_CMD1_WR
MBOX2C0 PMM_CMD1_RD
MBOX2C1 PMM_CMD1_WR
MBOX3C0 PMM_CMD1_RD
MBOX3C1 PMM_CMD1_WR
MBOX4C0 PMM_CMD1_RD
MBOX4C1 PMM_CMD1_WR
MBOX5C0 PMM_CMD1_RD
MBOX5C1 PMM_CMD1_WR
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
PMM read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time
PMM read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0
PMM write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time
PMM write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0
PMM bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time
PMM data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0
LONG
Formulas:
PMM read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime
PMM read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
PMM write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime
PMM write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
PMM bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime
PMM data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
-
Profiling group to measure data rate and volume for accesses to Intel Optane DC
persistent memory. The Intel Optane DC devices are handled by the memory
controllers but require different events.

View File

@ -0,0 +1,35 @@
SHORT L2 data TLB miss rate/ratio
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK
PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK
PMC2 DTLB_LOAD_MISSES_WALK_ACTIVE
PMC3 DTLB_STORE_MISSES_WALK_ACTIVE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
L1 DTLB load misses PMC0
L1 DTLB load miss rate PMC0/FIXC0
L1 DTLB load miss duration [Cyc] PMC2/PMC0
L1 DTLB store misses PMC1
L1 DTLB store miss rate PMC1/FIXC0
L1 DTLB store miss duration [Cyc] PMC3/PMC1
LONG
Formulas:
L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_ACTIVE / DTLB_LOAD_MISSES_CAUSES_A_WALK
L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_ACTIVE / DTLB_STORE_MISSES_CAUSES_A_WALK
-
The DTLB load and store miss rates give a measure of how often a TLB miss occurred
per instruction. The duration measures how long a page walk took, in cycles.

View File

@ -0,0 +1,28 @@
SHORT L1 Instruction TLB miss rate/ratio
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 ITLB_MISSES_CAUSES_A_WALK
PMC1 ITLB_MISSES_WALK_ACTIVE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
L1 ITLB misses PMC0
L1 ITLB miss rate PMC0/FIXC0
L1 ITLB miss duration [Cyc] PMC1/PMC0
LONG
Formulas:
L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_ACTIVE / ITLB_MISSES_CAUSES_A_WALK
-
The ITLB miss rate gives a measure of how often a TLB miss occurred
per instruction. The duration measures how long a page walk took, in cycles.

View File

@ -0,0 +1,48 @@
SHORT Top down cycle allocation
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 UOPS_ISSUED_ANY
PMC1 UOPS_RETIRED_RETIRE_SLOTS
PMC2 IDQ_UOPS_NOT_DELIVERED_CORE
PMC3 INT_MISC_RECOVERY_CYCLES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
IPC FIXC0/FIXC1
Total Slots 4*FIXC1
Slots Retired PMC1
Fetch Bubbles PMC2
Recovery Bubbles 4*PMC3
Front End [%] PMC2/(4*FIXC1)*100
Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100
Retiring [%] PMC1/(4*FIXC1)*100
Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100
LONG
Formulas:
Total Slots = 4*CPU_CLK_UNHALTED_CORE
Slots Retired = UOPS_RETIRED_RETIRE_SLOTS
Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE
Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES
Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100
Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100
Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100
Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100
--
This performance group measures cycles to determine percentage of time spent in
front end, back end, retiring and speculation. These metrics are published and
verified by Intel. Further information:
Webpage describing Top-Down Method and its usage in Intel vTune:
https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method
Paper by Yasin Ahmad:
https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0
Slides by Yasin Ahmad:
http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf
The performance group was originally published here:
http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/

View File

@ -0,0 +1,31 @@
SHORT UOPs execution
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 UOPS_EXECUTED_USED_CYCLES
PMC1 UOPS_EXECUTED_STALL_CYCLES
PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Used cycles ratio [%] 100*PMC0/PMC2
Unused cycles ratio [%] 100*PMC1/PMC2
Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
LONG
Formulas:
Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT
-
This performance group returns the ratios of used and unused cycles regarding
the execution stage in the pipeline. Used cycles are all cycles where uOPs are
executed while unused cycles refer to pipeline stalls. Moreover, the group
calculates the average stall duration in cycles.

View File

@ -0,0 +1,31 @@
SHORT UOPs issuing
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 UOPS_ISSUED_USED_CYCLES
PMC1 UOPS_ISSUED_STALL_CYCLES
PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Used cycles ratio [%] 100*PMC0/PMC2
Unused cycles ratio [%] 100*PMC1/PMC2
Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
LONG
Formulas:
Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT
-
This performance group returns the ratios of used and unused cycles regarding
the issue stage in the pipeline. Used cycles are all cycles where uOPs are
issued while unused cycles refer to pipeline stalls. Moreover, the group
calculates the average stall duration in cycles.

View File

@ -0,0 +1,31 @@
SHORT UOPs retirement
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 UOPS_RETIRED_USED_CYCLES
PMC1 UOPS_RETIRED_STALL_CYCLES
PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Used cycles ratio [%] 100*PMC0/PMC2
Unused cycles ratio [%] 100*PMC1/PMC2
Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
LONG
Formulas:
Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT
-
This performance group returns the ratios of used and unused cycles regarding
the retirement stage in the pipeline (re-order buffer). Used cycles are all
cycles where uOPs are retired while unused cycles refer to pipeline stalls.
Moreover, the group calculates the average stall duration in cycles.

View File

@ -0,0 +1,42 @@
SHORT UPI data traffic
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
SBOX0C0 TXL_FLITS_ALL_DATA
SBOX0C1 RXL_FLITS_ALL_DATA
SBOX1C0 TXL_FLITS_ALL_DATA
SBOX1C1 RXL_FLITS_ALL_DATA
SBOX2C0 TXL_FLITS_ALL_DATA
SBOX2C1 RXL_FLITS_ALL_DATA
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Received data bandwidth [MByte/s] 1.0E-06*((SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0/time
Received data volume [GByte] 1.0E-09*((SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0
Sent data bandwidth [MByte/s] 1.0E-06*((SBOX0C0+SBOX1C0+SBOX2C0)/9.0)*64.0/time
Sent data volume [GByte] 1.0E-09*((SBOX0C0+SBOX1C0+SBOX2C0)/9.0)*64.0
Total data bandwidth [MByte/s] 1.0E-06*((SBOX0C0+SBOX1C0+SBOX2C0+SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0/time
Total data volume [GByte] 1.0E-09*((SBOX0C0+SBOX1C0+SBOX2C0+SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0
LONG
Formulas:
Received data bandwidth [MByte/s] = 1.0E-06*(SUM(RXL_FLITS_ALL_DATA)/9.0)*64.0/runtime
Received data volume [GByte] = 1.0E-09*(SUM(RXL_FLITS_ALL_DATA)/9.0)*64.0
Sent data bandwidth [MByte/s] = 1.0E-06*(SUM(TXL_FLITS_ALL_DATA)/9.0)*64.0/runtime
Sent data volume [GByte] = 1.0E-09*(SUM(TXL_FLITS_ALL_DATA)/9.0)*64.0
Total data bandwidth [MByte/s] = 1.0E-06*((SUM(RXL_FLITS_ALL_DATA)+SUM(TXL_FLITS_ALL_DATA))/9.0)*64.0/runtime
Total data volume [GByte] = 1.0E-09*((SUM(RXL_FLITS_ALL_DATA)+SUM(TXL_FLITS_ALL_DATA))/9.0)*64.0
--
This group measures the data traffic on the UPI (socket interconnect). The group
measures all filled data slots (9 slots per 64 Byte data transfer), that's why
the count needs to be divided by 9. These 9 data chunks are not transferred in
a single flit but there is one flit for the header and three flits for the data.
The metrics show higher values than expected because the events also count
other transfers which include data.

View File

@ -0,0 +1,31 @@
SHORT Branch prediction miss rate/ratio
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 BR_INST_RETIRED_ALL_BRANCHES
PMC1 BR_MISP_RETIRED_ALL_BRANCHES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Branch rate PMC0/FIXC0
Branch misprediction rate PMC1/FIXC0
Branch misprediction ratio PMC1/PMC0
Instructions per branch FIXC0/PMC0
LONG
Formulas:
Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
-
The rates state how often on average a branch or a mispredicted branch occurred
per instruction retired in total. The branch misprediction ratio sets directly
into relation what ratio of all branch instructions were mispredicted.
Instructions per branch is 1/branch rate.

View File

@ -0,0 +1,22 @@
SHORT Load to store ratio
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 MEM_INST_RETIRED_ALL_LOADS
PMC1 MEM_INST_RETIRED_ALL_STORES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Load to store ratio PMC0/PMC1
LONG
Formulas:
Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES
-
This is a metric to determine your load to store ratio.

View File

@ -0,0 +1,24 @@
SHORT Divide unit information
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 ARITH_DIVIDER_COUNT
PMC1 ARITH_DIVIDER_ACTIVE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Number of divide ops PMC0
Avg. divide unit usage duration PMC1/PMC0
LONG
Formulas:
Number of divide ops = ARITH_DIVIDER_COUNT
Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT
-
This performance group measures the average latency of divide operations

View File

@ -0,0 +1,35 @@
SHORT Power and Energy consumption
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
TMP0 TEMP_CORE
PWR0 PWR_PKG_ENERGY
PWR1 PWR_PP0_ENERGY
PWR3 PWR_DRAM_ENERGY
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Temperature [C] TMP0
Energy [J] PWR0
Power [W] PWR0/time
Energy PP0 [J] PWR1
Power PP0 [W] PWR1/time
Energy DRAM [J] PWR3
Power DRAM [W] PWR3/time
LONG
Formulas:
Power = PWR_PKG_ENERGY / time
Power PP0 = PWR_PP0_ENERGY / time
Power DRAM = PWR_DRAM_ENERGY / time
-
Broadwell implements the new RAPL interface. This interface makes it possible to
monitor the consumed energy on the package (socket) and DRAM level.

View File

@ -0,0 +1,25 @@
SHORT Packed AVX MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
PMC2 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0+PMC2*16.0)/time
Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0+PMC3*8.0)/time
LONG
Formulas:
Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
-
Packed 32b AVX FLOPs rates.

View File

@ -0,0 +1,34 @@
SHORT Double Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time
AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time
AVX512 DP [MFLOP/s] 1.0E-06*(PMC3*8.0)/time
Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time
Scalar [MUOPS/s] 1.0E-06*PMC1/time
Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3)
LONG
Formulas:
DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
AVX512 DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime
Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)
-
SSE scalar and packed double precision FLOP rates.

View File

@ -0,0 +1,34 @@
SHORT Single Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE
PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time
AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time
AVX512 SP [MFLOP/s] 1.0E-06*(PMC3*16.0)/time
Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time
Scalar [MUOPS/s] 1.0E-06*PMC1/time
Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3)
LONG
Formulas:
SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
AVX512 SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime
Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)
-
SSE scalar and packed single precision FLOP rates.

View File

@ -0,0 +1,32 @@
SHORT Branch prediction miss rate/ratio
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
FIXC3 TOPDOWN_SLOTS
PMC0 BR_INST_RETIRED_ALL_BRANCHES
PMC1 BR_MISP_RETIRED_ALL_BRANCHES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Branch rate PMC0/FIXC0
Branch misprediction rate PMC1/FIXC0
Branch misprediction ratio PMC1/PMC0
Instructions per branch FIXC0/PMC0
LONG
Formulas:
Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
-
The rates state how often on average a branch or a mispredicted branch occurred
per instruction retired in total. The branch misprediction ratio sets directly
into relation what ratio of all branch instructions were mispredicted.
Instructions per branch is 1/branch rate.

View File

@ -0,0 +1,23 @@
SHORT Load to store ratio
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
FIXC3 TOPDOWN_SLOTS
PMC0 MEM_INST_RETIRED_ALL_LOADS
PMC1 MEM_INST_RETIRED_ALL_STORES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Load to store ratio PMC0/PMC1
LONG
Formulas:
Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES
-
This is a metric to determine your load to store ratio.

View File

@ -0,0 +1,25 @@
SHORT Divide unit information
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
FIXC3 TOPDOWN_SLOTS
PMC0 ARITH_DIVIDER_COUNT
PMC1 ARITH_DIVIDER_ACTIVE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Number of divide ops PMC0
Avg. divide unit usage duration PMC1/PMC0
LONG
Formulas:
Number of divide ops = ARITH_DIVIDER_COUNT
Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT
-
This performance group measures the average latency of divide operations

View File

@ -0,0 +1,26 @@
SHORT Packed AVX MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
FIXC3 TOPDOWN_SLOTS
PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
PMC2 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0+PMC2*16.0)/time
Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0+PMC3*8.0)/time
LONG
Formulas:
Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
-
Packed single and double precision AVX and AVX-512 FLOP rates.

View File

@ -0,0 +1,35 @@
SHORT Double Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
FIXC3 TOPDOWN_SLOTS
PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time
AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time
AVX512 DP [MFLOP/s] 1.0E-06*(PMC3*8.0)/time
Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time
Scalar [MUOPS/s] 1.0E-06*PMC1/time
Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3)
LONG
Formulas:
DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
AVX512 DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime
Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)
-
Scalar and packed (SSE, AVX, AVX-512) double precision FLOP rates.

View File

@ -0,0 +1,35 @@
SHORT Single Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
FIXC3 TOPDOWN_SLOTS
PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE
PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time
AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time
AVX512 SP [MFLOP/s] 1.0E-06*(PMC3*16.0)/time
Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time
Scalar [MUOPS/s] 1.0E-06*PMC1/time
Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3)
LONG
Formulas:
SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
AVX512 SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime
Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)
-
Scalar and packed (SSE, AVX, AVX-512) single precision FLOP rates.

View File

@ -0,0 +1,39 @@
SHORT L2 cache bandwidth in MBytes/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
FIXC3 TOPDOWN_SLOTS
PMC0 L1D_REPLACEMENT
PMC1 L2_TRANS_L1D_WB
PMC2 ICACHE_64B_IFTAG_MISS
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
LONG
Formulas:
L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time
L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0
L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_64B_IFTAG_MISS)*64/time
L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_64B_IFTAG_MISS)*64
-
Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the
number of cache lines allocated in the L1 and the number of modified cache lines
evicted from the L1. The group also outputs the total data volume transferred between
L2 and L1. Note that this bandwidth also includes data transfers due to a write
allocate load on a store miss in L1 and traffic caused by misses in the
L1 instruction cache.

View File

@ -0,0 +1,31 @@
SHORT Branch prediction miss rate/ratio
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 BR_INST_RETIRED_ALL_BRANCHES
PMC1 BR_MISP_RETIRED_ALL_BRANCHES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Branch rate PMC0/FIXC0
Branch misprediction rate PMC1/FIXC0
Branch misprediction ratio PMC1/PMC0
Instructions per branch FIXC0/PMC0
LONG
Formulas:
Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
-
The rates state how often on average a branch or a mispredicted branch occurred
per instruction retired in total. The branch misprediction ratio sets directly
into relation what ratio of all branch instructions were mispredicted.
Instructions per branch is 1/branch rate.

View File

@ -0,0 +1,22 @@
SHORT Load to store ratio
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 MEM_INST_RETIRED_ALL_LOADS
PMC1 MEM_INST_RETIRED_ALL_STORES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Load to store ratio PMC0/PMC1
LONG
Formulas:
Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES
-
This is a metric to determine your load to store ratio.

View File

@ -0,0 +1,24 @@
SHORT Divide unit information
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 ARITH_DIVIDER_COUNT
PMC1 ARITH_DIVIDER_ACTIVE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Number of divide ops PMC0
Avg. divide unit usage duration PMC1/PMC0
LONG
Formulas:
Number of divide ops = ARITH_DIVIDER_COUNT
Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT
-
This performance group measures the average latency of divide operations

View File

@ -0,0 +1,35 @@
SHORT Power and Energy consumption
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
TMP0 TEMP_CORE
PWR0 PWR_PKG_ENERGY
PWR1 PWR_PP0_ENERGY
PWR3 PWR_DRAM_ENERGY
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Temperature [C] TMP0
Energy [J] PWR0
Power [W] PWR0/time
Energy PP0 [J] PWR1
Power PP0 [W] PWR1/time
Energy DRAM [J] PWR3
Power DRAM [W] PWR3/time
LONG
Formulas:
Power = PWR_PKG_ENERGY / time
Power PP0 = PWR_PP0_ENERGY / time
Power DRAM = PWR_DRAM_ENERGY / time
-
Broadwell implements the new RAPL interface. This interface makes it possible to
monitor the consumed energy at the package (socket) and DRAM level.

View File

@ -0,0 +1,25 @@
SHORT Packed AVX MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
PMC2 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0+PMC2*16.0)/time
Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0+PMC3*8.0)/time
LONG
Formulas:
Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
-
Packed single and double precision AVX and AVX-512 FLOP rates.

View File

@ -0,0 +1,34 @@
SHORT Double Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time
AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time
AVX512 DP [MFLOP/s] 1.0E-06*(PMC3*8.0)/time
Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time
Scalar [MUOPS/s] 1.0E-06*PMC1/time
Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3)
LONG
Formulas:
DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
AVX512 DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime
Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime
Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)
-
Scalar and packed (SSE, AVX, AVX-512) double precision FLOP rates.

View File

@ -0,0 +1,34 @@
SHORT Single Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE
PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time
AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time
AVX512 SP [MFLOP/s] 1.0E-06*(PMC3*16.0)/time
Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time
Scalar [MUOPS/s] 1.0E-06*PMC1/time
Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3)
LONG
Formulas:
SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
AVX512 SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime
Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime
Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)
-
Scalar and packed (SSE, AVX, AVX-512) single precision FLOP rates.

View File

@ -0,0 +1,30 @@
SHORT Branch prediction miss rate/ratio
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 BR_PRED
PMC3 BR_MIS_PRED
METRICS
Runtime (RDTSC) [s] time
CPI PMC1/PMC0
Branch rate PMC2/PMC0
Branch misprediction rate PMC3/PMC0
Branch misprediction ratio PMC3/(PMC2+PMC3)
Instructions per branch PMC0/(PMC2+PMC3)
LONG
Formulas:
CPI = CPU_CYCLES/INST_RETIRED
Branch rate = BR_PRED/INST_RETIRED
Branch misprediction rate = BR_MIS_PRED/INST_RETIRED
Branch misprediction ratio = BR_MIS_PRED/(BR_PRED+BR_MIS_PRED)
Instructions per branch = INST_RETIRED/(BR_PRED+BR_MIS_PRED)
-
The rates state how often on average a branch or a mispredicted branch occurred
per instruction retired in total. The Branch misprediction ratio sets directly
into relation what ratio of all branch instructions were mispredicted.
Instructions per branch is 1/Branch rate.

View File

@ -0,0 +1,24 @@
SHORT Load to store ratio
EVENTSET
PMC0 INST_SPEC
PMC1 CPU_CYCLES
PMC2 LD_SPEC
PMC3 ST_SPEC
METRICS
Runtime (RDTSC) [s] time
CPI PMC1/PMC0
Load to store ratio PMC2/PMC3
Load ratio PMC2/PMC0
Store ratio PMC3/PMC0
LONG
Formulas:
CPI = CPU_CYCLES/INST_SPEC
Load to store ratio = LD_SPEC / ST_SPEC
Load ratio = LD_SPEC / INST_SPEC
Store ratio = ST_SPEC / INST_SPEC
-
This is a metric to determine your load to store ratio.

View File

@ -0,0 +1,26 @@
SHORT Double Precision MFLOP/s
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC3 FP_DP_FIXED_OPS_SPEC
PMC4 FP_DP_SCALE_OPS_SPEC
METRICS
Runtime (RDTSC) [s] time
Clock [MHz] 1.E-06*PMC1/time
CPI PMC1/PMC0
DP (FP) [MFLOP/s] 1E-06*(PMC3)/time
DP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC4*128)/128)+PMC3)/time
DP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC4*256)/128)+PMC3)/time
DP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC4*512)/128)+PMC3)/time
LONG
Formulas:
DP (FP) [MFLOP/s] = 1E-06*FP_DP_FIXED_OPS_SPEC/time
DP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*128)/128))/time
DP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*256)/128))/time
DP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*512)/128))/time
-
Double-precision FP rate for scalar and SVE vector operations with different widths. The events for
the SVE metrics assume that all vector elements are active.

View File

@ -0,0 +1,26 @@
SHORT Half-Precision MFLOP/s
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC3 FP_HP_FIXED_OPS_SPEC
PMC4 FP_HP_SCALE_OPS_SPEC
METRICS
Runtime (RDTSC) [s] time
Clock [MHz] 1.E-06*PMC1/time
CPI PMC1/PMC0
HP (FP) [MFLOP/s] 1E-06*(PMC3)/time
HP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC4*128)/128)+PMC3)/time
HP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC4*256)/128)+PMC3)/time
HP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC4*512)/128)+PMC3)/time
LONG
Formulas:
HP (FP) [MFLOP/s] = 1E-06*FP_HP_FIXED_OPS_SPEC/time
HP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*128)/128))/time
HP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*256)/128))/time
HP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*512)/128))/time
-
Half-precision FP rate for scalar and SVE vector operations with different widths. The events for
the SVE metrics assume that all vector elements are active.

View File

@ -0,0 +1,26 @@
SHORT Single Precision MFLOP/s
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC3 FP_SP_FIXED_OPS_SPEC
PMC4 FP_SP_SCALE_OPS_SPEC
METRICS
Runtime (RDTSC) [s] time
Clock [MHz] 1.E-06*PMC1/time
CPI PMC1/PMC0
SP (FP) [MFLOP/s] 1E-06*(PMC3)/time
SP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC4*128)/128)+PMC3)/time
SP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC4*256)/128)+PMC3)/time
SP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC4*512)/128)+PMC3)/time
LONG
Formulas:
SP (FP) [MFLOP/s] = 1E-06*FP_SP_FIXED_OPS_SPEC/time
SP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*128)/128))/time
SP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*256)/128))/time
SP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*512)/128))/time
-
Single-precision FP rate for scalar and SVE vector operations with different widths. The events for
the SVE metrics assume that all vector elements are active.

View File

@ -0,0 +1,33 @@
SHORT Utilization of FP pipelines
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 FLA_VAL
PMC3 FLA_VAL_PRD_CNT
PMC4 FLB_VAL
PMC5 FLB_VAL_PRD_CNT
METRICS
Runtime (RDTSC) [s] time
CPI PMC1/PMC0
FP operation pipeline A busy rate [%] (PMC2/PMC1)*100.0
FP pipeline A active element rate [%] (PMC3/(PMC2*16))*100.0
FP operation pipeline B busy rate [%] (PMC4/PMC1)*100.0
FP pipeline B active element rate [%] (PMC5/(PMC4*16))*100.0
LONG
Formulas:
CPI = CPU_CYCLES/INST_RETIRED
FP operation pipeline A busy rate [%] = (FLA_VAL/CPU_CYCLES)*100.0
FP pipeline A active element rate [%] = (FLA_VAL_PRD_CNT/(FLA_VAL*16))*100.0
FP operation pipeline B busy rate [%] = (FLB_VAL/CPU_CYCLES)*100.0
FP pipeline B active element rate [%] = (FLB_VAL_PRD_CNT/(FLB_VAL*16))*100.0
-
FLx_VAL: This event counts valid cycles of the FLx pipeline.
FLx_VAL_PRD_CNT: This event counts the number of 1's in the predicate bits of
requests in the FLx pipeline, where the count is corrected so that it
becomes 16 when all bits are 1.
Each predicate mask therefore has 16 slots, so there are 16 slots per cycle in
FLA and FLB. FLA is partly used by other instructions like SVE stores.

View File

@ -0,0 +1,24 @@
SHORT Instruction cache miss rate/ratio
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 L1I_CACHE
PMC3 L1I_CACHE_REFILL
METRICS
Runtime (RDTSC) [s] time
CPI PMC1/PMC0
L1I request rate PMC2/PMC0
L1I miss rate PMC3/PMC0
L1I miss ratio PMC3/PMC2
LONG
Formulas:
CPI = CPU_CYCLES/INST_RETIRED
L1I request rate = L1I_CACHE / INST_RETIRED
L1I miss rate = L1I_CACHE_REFILL / INST_RETIRED
L1I miss ratio = L1I_CACHE_REFILL / L1I_CACHE
-
This group measures some L1 instruction cache metrics.

View File

@ -0,0 +1,40 @@
SHORT L2 cache bandwidth in MBytes/s
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 L1D_CACHE_REFILL
PMC3 L1D_CACHE_WB
PMC4 L1I_CACHE_REFILL
METRICS
Runtime (RDTSC) [s] time
CPI PMC1/PMC0
L1D<-L2 load bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time
L1D<-L2 load data volume [GBytes] 1.0E-09*(PMC2)*256.0
L1D->L2 evict bandwidth [MBytes/s] 1.0E-06*PMC3*256.0/time
L1D->L2 evict data volume [GBytes] 1.0E-09*PMC3*256.0
L1I<-L2 load bandwidth [MBytes/s] 1.0E-06*PMC4*256.0/time
L1I<-L2 load data volume [GBytes] 1.0E-09*PMC4*256.0
L1<->L2 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3+PMC4)*256.0/time
L1<->L2 data volume [GBytes] 1.0E-09*(PMC2+PMC3+PMC4)*256.0
LONG
Formulas:
CPI = CPU_CYCLES/INST_RETIRED
L1D<-L2 load bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_REFILL*256.0/time
L1D<-L2 load data volume [GBytes] = 1.0E-09*L1D_CACHE_REFILL*256.0
L1D->L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_WB*256.0/time
L1D->L2 evict data volume [GBytes] = 1.0E-09*L1D_CACHE_WB*256.0
L1I<-L2 load bandwidth [MBytes/s] = 1.0E-06*L1I_CACHE_REFILL*256.0/time
L1I<-L2 load data volume [GBytes] = 1.0E-09*L1I_CACHE_REFILL*256.0
L1<->L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*256.0/time
L1<->L2 data volume [GBytes] = 1.0E-09*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*256.0
-
Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the
number of cache lines loaded from the L2 to the L1 data cache and the writebacks from
the L1 data cache to the L2 cache. The group also outputs the total data volume transferred between
L2 and L1. Note that this bandwidth also includes data transfers due to a write
allocate load on a store miss in L1 and cache lines transferred into the L1 instruction
cache.

View File

@ -0,0 +1,29 @@
SHORT Main memory bandwidth in MBytes/s
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 BUS_READ_TOTAL_MEM
PMC3 BUS_WRITE_TOTAL_MEM
METRICS
Runtime (RDTSC) [s] time
CPI PMC1/PMC0
Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time
Memory read data volume [GBytes] 1.0E-09*(PMC2)*256.0
Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time
Memory write data volume [GBytes] 1.0E-09*(PMC3)*256.0
Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time
Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0
LONG
Formulas:
Memory read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM)*256.0/runtime
Memory read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM)*256.0
Memory write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_MEM)*256.0/runtime
Memory write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_MEM)*256.0
Memory bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0/runtime
Memory data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0
-
Profiling group to measure memory bandwidth. The cache line size is 256 Byte.

View File

@ -0,0 +1,50 @@
SHORT Overview of arithmetic and main memory performance
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 BUS_READ_TOTAL_MEM
PMC3 BUS_WRITE_TOTAL_MEM
PMC4 FP_DP_FIXED_OPS_SPEC
PMC5 FP_DP_SCALE_OPS_SPEC
METRICS
Runtime (RDTSC) [s] time
CPI PMC1/PMC0
DP (FP) [MFLOP/s] 1E-06*(PMC4)/time
DP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC5*128.0)/128.0)+PMC4)/time
DP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC5*256.0)/128.0)+PMC4)/time
DP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC5*512.0)/128.0)+PMC4)/time
Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time
Memory read data volume [GBytes] 1.0E-09*(PMC2)*256.0
Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time
Memory write data volume [GBytes] 1.0E-09*(PMC3)*256.0
Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time
Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0
Operational intensity (FP) PMC4/((PMC2+PMC3)*256.0)
Operational intensity (FP+SVE128) (((PMC5*128.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0)
Operational intensity (FP+SVE256) (((PMC5*256.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0)
Operational intensity (FP+SVE512) (((PMC5*512.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0)
LONG
Formulas:
DP (FP) [MFLOP/s] = 1E-06*FP_DP_FIXED_OPS_SPEC/time
DP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*128)/128))/time
DP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*256)/128))/time
DP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*512)/128))/time
Memory read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM)*256.0/runtime
Memory read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM)*256.0
Memory write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_MEM)*256.0/runtime
Memory write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_MEM)*256.0
Memory bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0/runtime
Memory data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0
Operational intensity (FP) = FP_DP_FIXED_OPS_SPEC/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0)
Operational intensity (FP+SVE128) = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*128)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0)
Operational intensity (FP+SVE256) = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*256)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0)
Operational intensity (FP+SVE512) = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*512)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0)
-
Profiling group to measure memory bandwidth and double-precision FP rate for scalar and SVE vector
operations with different widths. The events for the SVE metrics assume that all vector elements
are active. The cache line size is 256 Byte.

View File

@ -0,0 +1,50 @@
SHORT Overview of arithmetic and main memory performance
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 BUS_READ_TOTAL_MEM
PMC3 BUS_WRITE_TOTAL_MEM
PMC4 FP_HP_FIXED_OPS_SPEC
PMC5 FP_HP_SCALE_OPS_SPEC
METRICS
Runtime (RDTSC) [s] time
CPI PMC1/PMC0
HP (FP) [MFLOP/s] 1E-06*(PMC4)/time
HP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC5*128.0)/128.0)+PMC4)/time
HP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC5*256.0)/128.0)+PMC4)/time
HP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC5*512.0)/128.0)+PMC4)/time
Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time
Memory read data volume [GBytes] 1.0E-09*(PMC2)*256.0
Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time
Memory write data volume [GBytes] 1.0E-09*(PMC3)*256.0
Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time
Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0
Operational intensity (FP) PMC4/((PMC2+PMC3)*256.0)
Operational intensity (FP+SVE128) (((PMC5*128.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0)
Operational intensity (FP+SVE256) (((PMC5*256.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0)
Operational intensity (FP+SVE512) (((PMC5*512.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0)
LONG
Formulas:
HP (FP) [MFLOP/s] = 1E-06*FP_HP_FIXED_OPS_SPEC/time
HP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*128)/128))/time
HP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*256)/128))/time
HP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*512)/128))/time
Memory read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM)*256.0/runtime
Memory read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM)*256.0
Memory write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_MEM)*256.0/runtime
Memory write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_MEM)*256.0
Memory bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0/runtime
Memory data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0
Operational intensity (FP) = FP_HP_FIXED_OPS_SPEC/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0)
Operational intensity (FP+SVE128) = (FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*128)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0)
Operational intensity (FP+SVE256) = (FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*256)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0)
Operational intensity (FP+SVE512) = (FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*512)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0)
-
Profiling group to measure memory bandwidth and half-precision FP rate for scalar and SVE vector
operations with different widths. The events for the SVE metrics assume that all vector elements
are active. The cache line size is 256 Byte.

View File

@ -0,0 +1,50 @@
SHORT Overview of arithmetic and main memory performance
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 BUS_READ_TOTAL_MEM
PMC3 BUS_WRITE_TOTAL_MEM
PMC4 FP_SP_FIXED_OPS_SPEC
PMC5 FP_SP_SCALE_OPS_SPEC
METRICS
Runtime (RDTSC) [s] time
CPI PMC1/PMC0
SP (FP) [MFLOP/s] 1E-06*(PMC4)/time
SP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC5*128.0)/128.0)+PMC4)/time
SP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC5*256.0)/128.0)+PMC4)/time
SP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC5*512.0)/128.0)+PMC4)/time
Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time
Memory read data volume [GBytes] 1.0E-09*(PMC2)*256.0
Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time
Memory write data volume [GBytes] 1.0E-09*(PMC3)*256.0
Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time
Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0
Operational intensity (FP) PMC4/((PMC2+PMC3)*256.0)
Operational intensity (FP+SVE128) (((PMC5*128.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0)
Operational intensity (FP+SVE256) (((PMC5*256.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0)
Operational intensity (FP+SVE512) (((PMC5*512.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0)
LONG
Formulas:
SP (FP) [MFLOP/s] = 1E-06*FP_SP_FIXED_OPS_SPEC/time
SP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*128)/128))/time
SP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*256)/128))/time
SP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*512)/128))/time
Memory read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM)*256.0/runtime
Memory read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM)*256.0
Memory write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_MEM)*256.0/runtime
Memory write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_MEM)*256.0
Memory bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0/runtime
Memory data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0
Operational intensity (FP) = FP_SP_FIXED_OPS_SPEC/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0)
Operational intensity (FP+SVE128) = (FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*128)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0)
Operational intensity (FP+SVE256) = (FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*256)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0)
Operational intensity (FP+SVE512) = (FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*512)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0)
-
Profiling group to measure memory bandwidth and single-precision FP rate for scalar and SVE vector
operations with different widths. The events for the SVE metrics assume that all vector elements
are active. The cache line size is 256 Byte.

View File

@ -0,0 +1,29 @@
SHORT PCI bandwidth in MBytes/s
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 BUS_READ_TOTAL_PCI
PMC3 BUS_WRITE_TOTAL_PCI
METRICS
Runtime (RDTSC) [s] time
CPI PMC1/PMC0
PCI read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time
PCI read data volume [GBytes] 1.0E-09*(PMC2)*256.0
PCI write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time
PCI write data volume [GBytes] 1.0E-09*(PMC3)*256.0
PCI bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time
PCI data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0
LONG
Formulas:
PCI read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_PCI)*256.0/runtime
PCI read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_PCI)*256.0
PCI write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_PCI)*256.0/runtime
PCI write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_PCI)*256.0
PCI bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_PCI+BUS_WRITE_TOTAL_PCI)*256.0/runtime
PCI data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_PCI+BUS_WRITE_TOTAL_PCI)*256.0
-
Profiling group to measure PCI bandwidth. The cache line size is 256 Byte.

View File

@ -0,0 +1,29 @@
SHORT TOFU bandwidth in MBytes/s
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 BUS_READ_TOTAL_TOFU
PMC3 BUS_WRITE_TOTAL_TOFU
METRICS
Runtime (RDTSC) [s] time
CPI PMC1/PMC0
TOFU read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time
TOFU read data volume [GBytes] 1.0E-09*(PMC2)*256.0
TOFU write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time
TOFU write data volume [GBytes] 1.0E-09*(PMC3)*256.0
TOFU bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time
TOFU data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0
LONG
Formulas:
TOFU read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_TOFU)*256.0/runtime
TOFU read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_TOFU)*256.0
TOFU write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_TOFU)*256.0/runtime
TOFU write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_TOFU)*256.0
TOFU bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_TOFU+BUS_WRITE_TOTAL_TOFU)*256.0/runtime
TOFU data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_TOFU+BUS_WRITE_TOTAL_TOFU)*256.0
-
Profiling group to measure TOFU bandwidth. The cache line size is 256 Byte.

View File

@ -0,0 +1,31 @@
SHORT Branch prediction miss rate/ratio
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 BR_PRED
PMC3 BR_MIS_PRED
PMC4 INST_SPEC
METRICS
Runtime (RDTSC) [s] time
CPI PMC1/PMC0
Branch rate PMC2/PMC0
Branch misprediction rate PMC3/PMC0
Branch misprediction ratio PMC3/(PMC2+PMC3)
Instructions per branch PMC0/(PMC2+PMC3)
LONG
Formulas:
CPI = CPU_CYCLES/INST_RETIRED
Branch rate = BR_PRED/INST_RETIRED
Branch misprediction rate = BR_MIS_PRED/INST_RETIRED
Branch misprediction ratio = BR_MIS_PRED/(BR_PRED+BR_MIS_PRED)
Instructions per branch = INST_RETIRED/(BR_PRED+BR_MIS_PRED)
-
The rates state how often on average a branch or a mispredicted branch occurred
per instruction retired in total. The Branch misprediction ratio sets directly
into relation what ratio of all branch instructions were mispredicted.
Instructions per branch is 1/Branch rate.

View File

@ -0,0 +1,24 @@
SHORT Load to store ratio
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 LD_RETIRED
PMC3 ST_RETIRED
METRICS
Runtime (RDTSC) [s] time
CPI PMC1/PMC0
Load to store ratio PMC2/PMC3
Load ratio PMC2/PMC0
Store ratio PMC3/PMC0
LONG
Formulas:
CPI = CPU_CYCLES/INST_RETIRED
Load to store ratio = LD_RETIRED / ST_RETIRED
Load ratio = LD_RETIRED / INST_RETIRED
Store ratio = ST_RETIRED / INST_RETIRED
-
This is a metric to determine your load to store ratio.

View File

@ -0,0 +1,24 @@
SHORT Instruction cache miss rate/ratio
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 L1I_CACHE
PMC3 L1I_CACHE_REFILL
METRICS
Runtime (RDTSC) [s] time
CPI PMC1/PMC0
L1I request rate PMC2/PMC0
L1I miss rate PMC3/PMC0
L1I miss ratio PMC3/PMC2
LONG
Formulas:
CPI = CPU_CYCLES/INST_RETIRED
L1I request rate = L1I_CACHE / INST_RETIRED
L1I miss rate = L1I_CACHE_REFILL / INST_RETIRED
L1I miss ratio = L1I_CACHE_REFILL / L1I_CACHE
-
This group measures some L1 instruction cache metrics.

View File

@ -0,0 +1,40 @@
SHORT L2 cache bandwidth in MBytes/s
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 L1D_CACHE_REFILL
PMC3 L1D_CACHE_WB
PMC4 L1I_CACHE_REFILL
METRICS
Runtime (RDTSC) [s] time
CPI PMC1/PMC0
L2D load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time
L2D load data volume [GBytes] 1.0E-09*PMC2*64.0
L2D evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time
L2D evict data volume [GBytes] 1.0E-09*PMC3*64.0
L2I load bandwidth [MBytes/s] 1.0E-06*PMC4*64.0/time
L2I load data volume [GBytes] 1.0E-09*PMC4*64.0
L2 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3+PMC4)*64.0/time
L2 data volume [GBytes] 1.0E-09*(PMC2+PMC3+PMC4)*64.0
LONG
Formulas:
CPI = CPU_CYCLES/INST_RETIRED
L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_REFILL*64.0/time
L2D load data volume [GBytes] = 1.0E-09*L1D_CACHE_REFILL*64.0
L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_WB*64.0/time
L2D evict data volume [GBytes] = 1.0E-09*L1D_CACHE_WB*64.0
L2I load bandwidth [MBytes/s] = 1.0E-06*L1I_CACHE_REFILL*64.0/time
L2I load data volume [GBytes] = 1.0E-09*L1I_CACHE_REFILL*64.0
L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0/time
L2 data volume [GBytes] = 1.0E-09*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0
-
Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
number of cache lines loaded from the L2 to the L1 data cache and the writebacks from
the L1 data cache to the L2 cache. The group also outputs total data volume transferred between
L2 and L1. Note that this bandwidth also includes data transfers due to a write
allocate load on a store miss in L1 and cache lines transferred to the instruction
cache.

View File

@ -0,0 +1,30 @@
SHORT Main memory bandwidth in MBytes/s
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 L2D_CACHE_REFILL
PMC3 L2D_CACHE_WB
METRICS
Runtime (RDTSC) [s] time
CPI PMC1/PMC0
Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*64.0/time
Memory read data volume [GBytes] 1.0E-09*(PMC2)*64.0
Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*64.0/time
Memory write data volume [GBytes] 1.0E-09*(PMC3)*64.0
Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
LONG
Formulas:
Memory read bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL)*64.0/runtime
Memory read data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL)*64.0
Memory write bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_WB)*64.0/runtime
Memory write data volume [GBytes] = 1.0E-09*(L2D_CACHE_WB)*64.0
Memory bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL+L2D_CACHE_WB)*64.0/runtime
Memory data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL+L2D_CACHE_WB)*64.0
-
Profiling group to measure memory bandwidth as initiated by the L2 cache.

View File

@ -0,0 +1,31 @@
SHORT Branch prediction miss rate/ratio
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 BR_PRED
PMC3 BR_MIS_PRED
PMC4 INST_SPEC
METRICS
Runtime (RDTSC) [s] time
CPI PMC1/PMC0
Branch rate PMC2/PMC0
Branch misprediction rate PMC3/PMC0
Branch misprediction ratio PMC3/(PMC2+PMC3)
Instructions per branch PMC0/(PMC2+PMC3)
LONG
Formulas:
CPI = CPU_CYCLES/INST_RETIRED
Branch rate = BR_PRED/INST_RETIRED
Branch misprediction rate = BR_MIS_PRED/INST_RETIRED
Branch misprediction ratio = BR_MIS_PRED/(BR_PRED+BR_MIS_PRED)
Instructions per branch = INST_RETIRED/(BR_PRED+BR_MIS_PRED)
-
The rates state how often on average a branch or a mispredicted branch occurred
per instruction retired in total. The Branch misprediction ratio sets directly
into relation what ratio of all branch instructions were mispredicted.
Instructions per branch is 1/Branch rate.

View File

@ -0,0 +1,16 @@
SHORT Cycles and instructions
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
METRICS
Runtime (RDTSC) [s] time
CPI PMC1/PMC0
LONG
Formulas:
CPI = CPU_CYCLES/INST_RETIRED
-
This is a metric to determine cycles per instruction.

View File

@ -0,0 +1,24 @@
SHORT Load to store ratio
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 LD_SPEC
PMC3 ST_SPEC
METRICS
Runtime (RDTSC) [s] time
CPI PMC1/PMC0
Load to store ratio PMC2/PMC3
Load ratio PMC2/PMC0
Store ratio PMC3/PMC0
LONG
Formulas:
CPI = CPU_CYCLES/INST_RETIRED
Load to store ratio = LD_SPEC / ST_SPEC
Load ratio = LD_SPEC / INST_RETIRED
Store ratio = ST_SPEC / INST_RETIRED
-
This is a metric to determine your load to store ratio.

View File

@ -0,0 +1,24 @@
SHORT Instruction cache miss rate/ratio
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 L1I_CACHE
PMC3 L1I_CACHE_REFILL
METRICS
Runtime (RDTSC) [s] time
CPI PMC1/PMC0
L1I request rate PMC2/PMC0
L1I miss rate PMC3/PMC0
L1I miss ratio PMC3/PMC2
LONG
Formulas:
CPI = CPU_CYCLES/INST_RETIRED
L1I request rate = L1I_CACHE / INST_RETIRED
L1I miss rate = L1I_CACHE_REFILL / INST_RETIRED
L1I miss ratio = L1I_CACHE_REFILL / L1I_CACHE
-
This group measures some L1 instruction cache metrics.

View File

@ -0,0 +1,40 @@
SHORT L2 cache bandwidth in MBytes/s
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 L1D_CACHE_REFILL
PMC3 L1D_CACHE_WB
PMC4 L1I_CACHE_REFILL
METRICS
Runtime (RDTSC) [s] time
CPI PMC1/PMC0
L2D load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time
L2D load data volume [GBytes] 1.0E-09*PMC2*64.0
L2D evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time
L2D evict data volume [GBytes] 1.0E-09*PMC3*64.0
L2I load bandwidth [MBytes/s] 1.0E-06*PMC4*64.0/time
L2I load data volume [GBytes] 1.0E-09*PMC4*64.0
L2 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3+PMC4)*64.0/time
L2 data volume [GBytes] 1.0E-09*(PMC2+PMC3+PMC4)*64.0
LONG
Formulas:
CPI = CPU_CYCLES/INST_RETIRED
L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_REFILL*64.0/time
L2D load data volume [GBytes] = 1.0E-09*L1D_CACHE_REFILL*64.0
L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_WB*64.0/time
L2D evict data volume [GBytes] = 1.0E-09*L1D_CACHE_WB*64.0
L2I load bandwidth [MBytes/s] = 1.0E-06*L1I_CACHE_REFILL*64.0/time
L2I load data volume [GBytes] = 1.0E-09*L1I_CACHE_REFILL*64.0
L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0/time
L2 data volume [GBytes] = 1.0E-09*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0
-
Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
number of cache lines loaded from the L2 to the L1 data cache and the writebacks from
the L1 data cache to the L2 cache. The group also outputs total data volume transferred between
L2 and L1. Note that this bandwidth also includes data transfers due to a write
allocate load on a store miss in L1 and cache lines transferred to the instruction
cache.

View File

@ -0,0 +1,30 @@
SHORT L3 cache bandwidth in MBytes/s
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 L2D_CACHE_REFILL
PMC3 L2D_CACHE_WB
METRICS
Runtime (RDTSC) [s] time
CPI PMC1/PMC0
L3 read bandwidth [MBytes/s] 1.0E-06*(PMC2)*64.0/time
L3 read data volume [GBytes] 1.0E-09*(PMC2)*64.0
L3 write bandwidth [MBytes/s] 1.0E-06*(PMC3)*64.0/time
L3 write data volume [GBytes] 1.0E-09*(PMC3)*64.0
L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
LONG
Formulas:
L3 read bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL)*64.0/runtime
L3 read data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL)*64.0
L3 write bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_WB)*64.0/runtime
L3 write data volume [GBytes] = 1.0E-09*(L2D_CACHE_WB)*64.0
L3 bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL+L2D_CACHE_WB)*64.0/runtime
L3 data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL+L2D_CACHE_WB)*64.0
-
Profiling group to measure traffic between L2 and L3 cache.

View File

@ -0,0 +1,29 @@
SHORT Main memory bandwidth in MBytes/s
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 MEM_ACCESS_RD
PMC3 MEM_ACCESS_WR
METRICS
Runtime (RDTSC) [s] time
CPI PMC1/PMC0
Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*64.0/time
Memory read data volume [GBytes] 1.0E-09*(PMC2)*64.0
Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*64.0/time
Memory write data volume [GBytes] 1.0E-09*(PMC3)*64.0
Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
LONG
Formulas:
Memory read bandwidth [MBytes/s] = 1.0E-06*(MEM_ACCESS_RD)*64.0/runtime
Memory read data volume [GBytes] = 1.0E-09*(MEM_ACCESS_RD)*64.0
Memory write bandwidth [MBytes/s] = 1.0E-06*(MEM_ACCESS_WR)*64.0/runtime
Memory write data volume [GBytes] = 1.0E-09*(MEM_ACCESS_WR)*64.0
Memory bandwidth [MBytes/s] = 1.0E-06*(MEM_ACCESS_RD+MEM_ACCESS_WR)*64.0/runtime
Memory data volume [GBytes] = 1.0E-09*(MEM_ACCESS_RD+MEM_ACCESS_WR)*64.0
-
Profiling group to measure memory bandwidth

View File

@ -0,0 +1,30 @@
SHORT L1/L2 TLB information
EVENTSET
PMC0 L1D_TLB
PMC1 L1I_TLB
PMC2 L2D_TLB
PMC3 L1D_TLB_REFILL
PMC4 L1I_TLB_REFILL
PMC5 L2D_TLB_REFILL
METRICS
Runtime (RDTSC) [s] time
L1 DTLB accesses PMC0
L1 ITLB accesses PMC1
L2 DTLB accesses PMC2
L1 DTLB refills PMC3
L1 ITLB refills PMC4
L2 DTLB refills PMC5
L1 DTLB refill ratio PMC3/PMC0
L1 ITLB refill ratio PMC4/PMC1
L2 DTLB refill ratio PMC5/PMC2
LONG
Formulas:
L1 DTLB refill ratio = L1D_TLB_REFILL / L1D_TLB
L1 ITLB refill ratio = L1I_TLB_REFILL / L1I_TLB
L2 DTLB refill ratio = L2D_TLB_REFILL / L2D_TLB
-
This group gives information about the TLB usage for all TLBs:
L1 data, L1 instruction and L2 data.

View File

@ -0,0 +1,32 @@
SHORT Branch prediction miss rate/ratio
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 BR_PRED
PMC3 BR_MIS_PRED
PMC4 INST_SPEC
METRICS
Runtime (RDTSC) [s] time
Clock [MHz] 1.E-06*PMC1/time
CPI PMC1/PMC0
Branch rate PMC2/PMC0
Branch misprediction rate PMC3/PMC0
Branch misprediction ratio PMC3/(PMC2+PMC3)
Instructions per branch PMC0/(PMC2+PMC3)
LONG
Formulas:
CPI = CPU_CYCLES/INST_RETIRED
Branch rate = BR_PRED/INST_RETIRED
Branch misprediction rate = BR_MIS_PRED/INST_RETIRED
Branch misprediction ratio = BR_MIS_PRED/(BR_PRED+BR_MIS_PRED)
Instructions per branch = INST_RETIRED/(BR_PRED+BR_MIS_PRED)
-
The rates state how often on average a branch or a mispredicted branch occurred
per instruction retired in total. The Branch misprediction ratio sets directly
into relation what ratio of all branch instructions were mispredicted.
Instructions per branch is 1/Branch rate.

View File

@ -0,0 +1,25 @@
SHORT Load to store ratio
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 LD_RETIRED
PMC3 ST_RETIRED
METRICS
Runtime (RDTSC) [s] time
Clock [MHz] 1.E-06*PMC1/time
CPI PMC1/PMC0
Load to store ratio PMC2/PMC3
Load ratio PMC2/PMC0
Store ratio PMC3/PMC0
LONG
Formulas:
CPI = CPU_CYCLES/INST_RETIRED
Load to store ratio = LD_RETIRED / ST_RETIRED
Load ratio = LD_RETIRED / INST_RETIRED
Store ratio = ST_RETIRED / INST_RETIRED
-
This is a metric to determine your load to store ratio.

View File

@ -0,0 +1,28 @@
SHORT Double Precision MFLOP/s
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 VFP_SPEC
PMC3 ASE_SPEC
METRICS
Runtime (RDTSC) [s] time
Clock [MHz] 1.E-06*PMC1/time
CPI PMC1/PMC0
DP [MFLOP/s] 1.0E-06*(PMC3*2.0+PMC2)/time
NEON DP [MFLOP/s] 1.0E-06*(PMC3*2.0)/time
Packed [MUOPS/s] 1.0E-06*(PMC3)/time
Scalar [MUOPS/s] 1.0E-06*PMC2/time
Vectorization ratio 100*(PMC3)/(PMC2+PMC3)
LONG
Formulas:
DP [MFLOP/s] = 1.0E-06*(ASE_SPEC*2+VFP_SPEC)/runtime
NEON DP [MFLOP/s] = 1.0E-06*(ASE_SPEC*2)/runtime
Packed [MUOPS/s] = 1.0E-06*(ASE_SPEC)/runtime
Scalar [MUOPS/s] = 1.0E-06*VFP_SPEC/runtime
Vectorization ratio = 100*(ASE_SPEC)/(ASE_SPEC+VFP_SPEC)
-
NEON scalar and packed double precision FLOP rates.

View File

@ -0,0 +1,28 @@
SHORT Single Precision MFLOP/s
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 VFP_SPEC
PMC3 ASE_SPEC
METRICS
Runtime (RDTSC) [s] time
Clock [MHz] 1.E-06*PMC1/time
CPI PMC1/PMC0
SP [MFLOP/s] 1.0E-06*(PMC3*2.0+PMC2)/time
NEON SP [MFLOP/s] 1.0E-06*(PMC3*2.0)/time
Packed [MUOPS/s] 1.0E-06*(PMC3)/time
Scalar [MUOPS/s] 1.0E-06*PMC2/time
Vectorization ratio 100*(PMC3)/(PMC2+PMC3)
LONG
Formulas:
SP [MFLOP/s] = 1.0E-06*(ASE_SPEC*2+VFP_SPEC)/runtime
NEON SP [MFLOP/s] = 1.0E-06*(ASE_SPEC*2)/runtime
Packed [MUOPS/s] = 1.0E-06*(ASE_SPEC)/runtime
Scalar [MUOPS/s] = 1.0E-06*VFP_SPEC/runtime
Vectorization ratio = 100*(ASE_SPEC)/(ASE_SPEC+VFP_SPEC)
-
NEON scalar and packed single precision FLOP rates.

View File

@ -0,0 +1,23 @@
SHORT Instruction cache miss rate/ratio
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 L1I_CACHE
PMC3 L1I_CACHE_REFILL
METRICS
Runtime (RDTSC) [s] time
Clock [MHz] 1.E-06*PMC1/time
CPI PMC1/PMC0
L1I request rate PMC2/PMC0
L1I miss rate PMC3/PMC0
L1I miss ratio PMC3/PMC2
LONG
Formulas:
L1I request rate = L1I_CACHE / INST_RETIRED
L1I miss rate = L1I_CACHE_REFILL / INST_RETIRED
L1I miss ratio = L1I_CACHE_REFILL / L1I_CACHE
-
This group measures some L1 instruction cache metrics.

View File

@ -0,0 +1,41 @@
SHORT L2 cache bandwidth in MBytes/s
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 L1D_CACHE_REFILL
PMC3 L1D_CACHE_WB
PMC4 L1I_CACHE_REFILL
METRICS
Runtime (RDTSC) [s] time
Clock [MHz] 1.E-06*PMC1/time
CPI PMC1/PMC0
L2D load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time
L2D load data volume [GBytes] 1.0E-09*PMC2*64.0
L2D evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time
L2D evict data volume [GBytes] 1.0E-09*PMC3*64.0
L2I load bandwidth [MBytes/s] 1.0E-06*PMC4*64.0/time
L2I load data volume [GBytes] 1.0E-09*PMC4*64.0
L2 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3+PMC4)*64.0/time
L2 data volume [GBytes] 1.0E-09*(PMC2+PMC3+PMC4)*64.0
LONG
Formulas:
CPI = CPU_CYCLES/INST_RETIRED
L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_REFILL*64.0/time
L2D load data volume [GBytes] = 1.0E-09*L1D_CACHE_REFILL*64.0
L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_WB*64.0/time
L2D evict data volume [GBytes] = 1.0E-09*L1D_CACHE_WB*64.0
L2I load bandwidth [MBytes/s] = 1.0E-06*L1I_CACHE_REFILL*64.0/time
L2I load data volume [GBytes] = 1.0E-09*L1I_CACHE_REFILL*64.0
L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0/time
L2 data volume [GBytes] = 1.0E-09*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0
-
Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
number of cache lines loaded from the L2 to the L1 data cache and the writebacks from
the L1 data cache to the L2 cache. The group also outputs total data volume transferred between
L2 and L1. Note that this bandwidth also includes data transfers due to a write
allocate load on a store miss in L1 and cache lines transferred to the instruction
cache.

View File

@ -0,0 +1,32 @@
SHORT L2 cache miss rate/ratio
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 L2D_CACHE
PMC3 L2D_CACHE_REFILL
METRICS
Runtime (RDTSC) [s] time
Clock [MHz] 1.E-06*PMC1/time
CPI PMC1/PMC0
L2 request rate PMC2/PMC0
L2 miss rate PMC3/PMC0
L2 miss ratio PMC3/PMC2
LONG
Formulas:
L2 request rate = L2D_CACHE/INST_RETIRED
L2 miss rate = L2D_CACHE_REFILL/INST_RETIRED
L2 miss ratio = L2D_CACHE_REFILL/L2D_CACHE
-
This group measures the locality of your data accesses with regard to the
L2 cache. L2 request rate tells you how data intensive your code is
or how many data accesses you have on average per instruction.
The L2 miss rate gives a measure how often it was necessary to get
cache lines from memory. And finally L2 miss ratio tells you how many of your
memory references required a cache line to be loaded from a higher level.
While the data cache miss rate might be given by your algorithm you should
try to get data cache miss ratio as low as possible by increasing your cache reuse.

View File

@ -0,0 +1,38 @@
SHORT L3 cache bandwidth in MBytes/s
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 L2D_CACHE_REFILL
PMC3 L2D_CACHE_WB
PMC4 L2D_CACHE_ALLOCATE
METRICS
Runtime (RDTSC) [s] time
Clock [MHz] 1.E-06*PMC1/time
CPI PMC1/PMC0
L3 load bandwidth [MBytes/s] 1.0E-06*(PMC2-PMC4)*64.0/time
L3 load data volume [GBytes] 1.0E-09*(PMC2-PMC4)*64.0
L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time
L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0
L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3-PMC4)*64.0/time
L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3-PMC4)*64.0
LONG
Formulas:
CPI = CPU_CYCLES/INST_RETIRED
L3 load bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL-L2D_CACHE_ALLOCATE)*64.0/time
L3 load data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL-L2D_CACHE_ALLOCATE)*64.0
L3 evict bandwidth [MBytes/s] = 1.0E-06*L2D_CACHE_WB*64.0/time
L3 evict data volume [GBytes] = 1.0E-09*L2D_CACHE_WB*64.0
L3 bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL+L2D_CACHE_WB-L2D_CACHE_ALLOCATE)*64.0/time
L3 data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL+L2D_CACHE_WB-L2D_CACHE_ALLOCATE)*64.0
-
Profiling group to measure L2 <-> L3 cache bandwidth. The bandwidth is computed by the
number of cache lines loaded from the L3 to the L2 data cache and the writebacks from
the L2 data cache to the L3 cache. The group also outputs total data volume transferred between
L3 and L2. For streaming-stores, the cache lines are allocated in L2, consequently there
is no traffic between L3 and L2 in this case. But the L2D_CACHE_REFILL event counts these
allocated cache lines, that's why the value of L2D_CACHE_REFILL is reduced
by L2D_CACHE_ALLOCATE.

View File

@ -0,0 +1,32 @@
SHORT Main memory bandwidth in MBytes/s
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
MBOX0C0 MEMORY_READS
MBOX0C1 MEMORY_WRITES
MBOX1C0 MEMORY_READS
MBOX1C1 MEMORY_WRITES
METRICS
Runtime (RDTSC) [s] time
Clock [MHz] 1.E-06*PMC1/time
CPI PMC1/PMC0
Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0)*64.0/time
Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0)*64.0
Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1)*64.0/time
Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1)*64.0
Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64.0/time
Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64.0
LONG
Formulas:
Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MEMORY_READS))*64.0/runtime
Memory read data volume [GBytes] = 1.0E-09*(SUM(MEMORY_READS))*64.0
Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MEMORY_WRITES))*64.0/runtime
Memory write data volume [GBytes] = 1.0E-09*(SUM(MEMORY_WRITES))*64.0
Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MEMORY_READS)+SUM(MEMORY_WRITES))*64.0/runtime
Memory data volume [GBytes] = 1.0E-09*(SUM(MEMORY_READS)+SUM(MEMORY_WRITES))*64.0
-
Profiling group to measure memory bandwidth. It uses the performance monitoring
hardware of the memory controllers.

View File

@ -0,0 +1,44 @@
SHORT Information about speculative execution
EVENTSET
PMC0 INST_SPEC
PMC1 LD_SPEC
PMC2 ST_SPEC
PMC3 DP_SPEC
PMC4 VFP_SPEC
PMC5 ASE_SPEC
METRICS
Runtime (RDTSC) [s] time
Operations spec. executed PMC0
Load ops spec. executed PMC1
Store ops spec. executed PMC2
Integer data ops spec. executed PMC3
Scalar FP ops spec. executed PMC4
Vector FP ops spec. executed PMC5
Other ops spec. executed (PMC0-PMC1-PMC2-PMC3-PMC4-PMC5)
Load ops spec. ratio PMC1/PMC0
Store ops spec. ratio PMC2/PMC0
Integer data ops spec. ratio PMC3/PMC0
Scalar FP ops spec. ratio PMC4/PMC0
Vector FP ops spec. ratio PMC5/PMC0
Other ops spec. ratio (PMC0-PMC1-PMC2-PMC3-PMC4-PMC5)/PMC0
LONG
Formulas:
Load ops spec. ratio = LD_SPEC / INST_SPEC
Store ops spec. ratio = ST_SPEC / INST_SPEC
Integer data ops spec. ratio = DP_SPEC / INST_SPEC
Scalar FP ops spec. ratio = VFP_SPEC / INST_SPEC
Vector FP ops spec. ratio = ASE_SPEC / INST_SPEC
Other ops spec. ratio = (INST_SPEC-LD_SPEC-ST_SPEC-DP_SPEC-VFP_SPEC-ASE_SPEC) / INST_SPEC
-
This group gives information about the speculative execution of micro-ops.
It is currently unclear why Other ops spec. executed and ratio is negative
in some cases. Although the documentation contains an OP_RETIRED, there is no
equivalent OP_SPEC which could be a better reference in this group instead of
INST_SPEC.

View File

@ -0,0 +1,27 @@
SHORT L1 data TLB miss rate/ratio
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 L1D_TLB_REFILL_RD
PMC3 L1D_TLB_REFILL_WR
METRICS
Runtime (RDTSC) [s] time
Clock [MHz] 1.E-06*PMC1/time
CPI PMC1/PMC0
L1 DTLB load misses PMC2
L1 DTLB load miss rate PMC2/PMC0
L1 DTLB store misses PMC3
L1 DTLB store miss rate PMC3/PMC0
LONG
Formulas:
L1 DTLB load misses = L1D_TLB_REFILL_RD
L1 DTLB load miss rate = L1D_TLB_REFILL_RD / INST_RETIRED
L1 DTLB store misses = L1D_TLB_REFILL_WR
L1 DTLB store miss rate = L1D_TLB_REFILL_WR / INST_RETIRED
-
The DTLB load and store miss rates give a measure of how often a TLB miss occurred
per instruction.

View File

@ -0,0 +1,23 @@
SHORT L1 Instruction TLB miss rate/ratio
EVENTSET
PMC0 INST_RETIRED
PMC1 CPU_CYCLES
PMC2 L1I_TLB_REFILL
METRICS
Runtime (RDTSC) [s] time
Clock [MHz] 1.E-06*PMC1/time
CPI PMC1/PMC0
L1 ITLB misses PMC2
L1 ITLB miss rate PMC2/PMC0
LONG
Formulas:
L1 ITLB misses = L1I_TLB_REFILL
L1 ITLB miss rate = L1I_TLB_REFILL / INST_RETIRED
-
The ITLB miss rate gives a measure of how often a TLB miss occurred
per instruction.

View File

@ -0,0 +1,29 @@
SHORT Branch prediction miss rate/ratio
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
PMC0 BR_INST_RETIRED_ANY
PMC1 BR_INST_RETIRED_MISPRED
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
CPI FIXC1/FIXC0
Branch rate PMC0/FIXC0
Branch misprediction rate PMC1/FIXC0
Branch misprediction ratio PMC1/PMC0
Instructions per branch FIXC0/PMC0
LONG
Formulas:
Branch rate = BR_INST_RETIRED_ANY/INSTR_RETIRED_ANY
Branch misprediction rate = BR_INST_RETIRED_MISPRED/INSTR_RETIRED_ANY
Branch misprediction ratio = BR_INST_RETIRED_MISPRED/BR_INST_RETIRED_ANY
Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ANY
-
The rates state how often on average a branch or a mispredicted branch occurred
per instruction retired in total. The branch misprediction ratio sets directly
into relation what ratio of all branch instructions were mispredicted.
Instructions per branch is 1/branch rate.

View File

@ -0,0 +1,20 @@
SHORT Load to store ratio
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
PMC0 L1D_CACHE_LD
PMC1 L1D_CACHE_ST
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
CPI FIXC1/FIXC0
Load to store ratio PMC0/PMC1
LONG
Formulas:
Load to store ratio = L1D_CACHE_LD/L1D_CACHE_ST
-
This is a simple metric to determine your load to store ratio.

View File

@ -0,0 +1,25 @@
SHORT Double Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
PMC0 SIMD_COMP_INST_RETIRED_PACKED_DOUBLE
PMC1 SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
CPI FIXC1/FIXC0
DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1)/time
Packed [MUOPS/s] 1.0E-06*PMC0/time
Scalar [MUOPS/s] 1.0E-06*PMC1/time
LONG
Formulas:
DP [MFLOP/s] = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_DOUBLE*2.0+SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE)/runtime
Packed [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_PACKED_DOUBLE/runtime
Scalar [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE/runtime
--
Double Precision [MFLOP/s] Double Precision MFLOP/s

View File

@ -0,0 +1,24 @@
SHORT Single Precision MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
PMC0 SIMD_COMP_INST_RETIRED_PACKED_SINGLE
PMC1 SIMD_COMP_INST_RETIRED_SCALAR_SINGLE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
CPI FIXC1/FIXC0
SP [MFLOP/s] (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
Packed [MUOPS/s] 1.0E-06*(PMC0)/time
Scalar [MUOPS/s] 1.0E-06*PMC1/time
LONG
Formulas:
SP [MFLOP/s] = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_SINGLE*4.0+SIMD_COMP_INST_RETIRED_SCALAR_SINGLE)/runtime
Packed [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_PACKED_SINGLE/runtime
Scalar [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_SCALAR_SINGLE/runtime
--
Single Precision [MFLOP/s] Single Precision MFLOP/s

View File

@ -0,0 +1,19 @@
SHORT X87 MFLOP/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
PMC0 X87_COMP_OPS_EXE_ANY_AR
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
CPI FIXC1/FIXC0
X87 [MFLOP/s] 1.0E-06*PMC0/time
LONG
Formulas:
X87 [MFLOP/s] = 1.0E-06*X87_COMP_OPS_EXE_ANY_AR/runtime
--
The MFLOP/s made with X87 instructions

View File

@ -0,0 +1,21 @@
SHORT Main memory bandwidth in MBytes/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
PMC0 BUS_TRANS_MEM_THIS_CORE_THIS_A
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
CPI FIXC1/FIXC0
Memory bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
Memory data volume [GBytes] 1.0E-09*PMC0*64.0
LONG
Formulas:
Memory bandwidth [MBytes/s] = 1.0E-06*BUS_TRANS_MEM_THIS_CORE_THIS_A*64/time
Memory data volume [GBytes] = 1.0E-09*BUS_TRANS_MEM_THIS_CORE_THIS_A*64.0
-
Profiling group to measure memory bandwidth drawn by this core.

View File

@ -0,0 +1,21 @@
SHORT TLB miss rate
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
PMC0 DATA_TLB_MISSES_DTLB_MISS
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
CPI FIXC1/FIXC0
DTLB misses PMC0
DTLB miss rate PMC0/FIXC0
LONG
Formulas:
DTLB misses = DATA_TLB_MISSES_DTLB_MISS
DTLB miss rate = DATA_TLB_MISSES_DTLB_MISS/INSTR_RETIRED_ANY
--
The DTLB miss rate gives a measure how often a TLB miss occurred per instruction.

View File

@ -0,0 +1,31 @@
SHORT Branch prediction miss rate/ratio
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 BR_INST_RETIRED_ALL_BRANCHES
PMC1 BR_MISP_RETIRED_ALL_BRANCHES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Branch rate PMC0/FIXC0
Branch misprediction rate PMC1/FIXC0
Branch misprediction ratio PMC1/PMC0
Instructions per branch FIXC0/PMC0
LONG
Formulas:
Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
-
The rates state how often on average a branch or a mispredicted branch occurred
per instruction retired in total. The branch misprediction ratio sets directly
into relation what ratio of all branch instructions were mispredicted.
Instructions per branch is 1/branch rate.

View File

@ -0,0 +1,26 @@
SHORT Power and Energy consumption
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PWR0 PWR_PKG_ENERGY
UBOXFIX UNCORE_CLOCK
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
Uncore Clock [MHz] 1.E-06*UBOXFIX/time
CPI FIXC1/FIXC0
Energy [J] PWR0
Power [W] PWR0/time
LONG
Formulas:
Power = PWR_PKG_ENERGY / time
Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time
-
Broadwell implements the new RAPL interface. This interface enables monitoring of
the consumed energy on the package (socket) level.

View File

@ -0,0 +1,38 @@
SHORT Cycle Activities
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING
PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING
PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING
PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Cycles without execution [%] (PMC3/FIXC1)*100
Cycles without execution due to L1D [%] (PMC2/FIXC1)*100
Cycles without execution due to L2 [%] (PMC0/FIXC1)*100
Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100
LONG
Formulas:
Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100
Cycles without execution due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100
Cycles without execution due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100
Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100
--
This performance group measures the cycles while waiting for data from the cache
and memory hierarchy.
CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts number of cycles nothing is executed on
any execution port.
CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is
outstanding.
CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is
outstanding.
CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while memory subsystem has an
outstanding load.

View File

@ -0,0 +1,45 @@
SHORT Cycle Activities (Stalls)
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING
PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING
PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING
PMC3 CYCLE_ACTIVITY_STALLS_TOTAL
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Total execution stalls PMC3
Stalls caused by L1D misses [%] (PMC2/PMC3)*100
Stalls caused by L2 misses [%] (PMC0/PMC3)*100
Stalls caused by memory loads [%] (PMC1/PMC3)*100
Execution stall rate [%] (PMC3/FIXC1)*100
Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100
Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100
Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100
LONG
Formulas:
Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL
Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100
Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100
Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100
Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100
--
This performance group measures the stalls caused by data traffic in the cache
hierarchy.
CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls.
CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while an L1 cache miss demand
load is outstanding.
CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while an L2 cache miss demand
load is outstanding.
CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while the memory subsystem has
an outstanding load.

View File

@ -0,0 +1,22 @@
SHORT Load to store ratio
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 MEM_UOPS_RETIRED_LOADS_ALL
PMC1 MEM_UOPS_RETIRED_STORES_ALL
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Load to store ratio PMC0/PMC1
LONG
Formulas:
Load to store ratio = MEM_UOPS_RETIRED_LOADS_ALL/MEM_UOPS_RETIRED_STORES_ALL
-
This is a metric to determine your load to store ratio.

View File

@ -0,0 +1,24 @@
SHORT Divide unit information
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0:EDGEDETECT ARITH_FPU_DIV_ACTIVE
PMC1 ARITH_FPU_DIV_ACTIVE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Number of divide ops PMC0:EDGEDETECT
Avg. divide unit usage duration PMC1/PMC0:EDGEDETECT
LONG
Formulas:
Number of divide ops = ARITH_FPU_DIV_ACTIVE:EDGEDETECT
Avg. divide unit usage duration = ARITH_FPU_DIV_ACTIVE/ARITH_FPU_DIV_ACTIVE:EDGEDETECT
-
This performance group measures the average latency of divide operations.

Some files were not shown because too many files have changed in this diff Show More