diff --git a/collectors/likwid/bstrlib.h b/collectors/likwid/bstrlib.h new file mode 100644 index 0000000..02a836e --- /dev/null +++ b/collectors/likwid/bstrlib.h @@ -0,0 +1,301 @@ +/* + * ======================================================================================= + * This source file is part of the bstring string library. This code was + * written by Paul Hsieh in 2002-2008, and is covered by the BSD open source + * license and the GPL. Refer to the accompanying documentation for details + * on usage and license. + */ +/* + * bstrlib.h + * + * This file is the header file for the core module implementing the bstring functions. + */ + +#ifndef BSTRLIB_INCLUDE +#define BSTRLIB_INCLUDE + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdarg.h> +#include <string.h> +#include <limits.h> +#include <ctype.h> + +#if !defined (BSTRLIB_VSNP_OK) && !defined (BSTRLIB_NOVSNP) +# if defined (__TURBOC__) && !defined (__BORLANDC__) +# define BSTRLIB_NOVSNP +# endif +#endif + +#define BSTR_ERR (-1) +#define BSTR_OK (0) +#define BSTR_BS_BUFF_LENGTH_GET (0) + +typedef struct tagbstring * bstring; +typedef const struct tagbstring * const_bstring; + +/* Copy functions */ +#define cstr2bstr bfromcstr +extern bstring bfromcstr (const char * str); +extern bstring bfromcstralloc (int mlen, const char * str); +extern bstring blk2bstr (const void * blk, int len); +extern char * bstr2cstr (const_bstring s, char z); +extern int bcstrfree (char * s); +extern bstring bstrcpy (const_bstring b1); +extern int bassign (bstring a, const_bstring b); +extern int bassignmidstr (bstring a, const_bstring b, int left, int len); +extern int bassigncstr (bstring a, const char * str); +extern int bassignblk (bstring a, const void * s, int len); + +/* Destroy function */ +extern int bdestroy (bstring b); + +/* Space allocation hinting functions */ +extern int balloc (bstring s, int len); +extern int ballocmin (bstring b, int len); + +/* Substring extraction */ +extern bstring bmidstr (const_bstring b, int left, int len); + +/* Various standard manipulations */ +extern int bconcat (bstring b0, const_bstring b1); +extern int bconchar (bstring b0, char c); +extern int bcatcstr (bstring b, const char * s); +extern int bcatblk (bstring b, const void * s, int len); +extern int binsert (bstring s1, int pos, const_bstring s2, unsigned char fill); +extern int binsertch (bstring s1, int pos, int len, unsigned char fill); +extern int breplace (bstring b1, int pos, int len, const_bstring b2, unsigned char fill); +extern int bdelete (bstring s1, int pos, int len); +extern int bsetstr (bstring b0, int pos, const_bstring b1, unsigned char fill); +extern int btrunc (bstring b, int n); + +/* Scan/search functions */ +extern int bstricmp (const_bstring b0, const_bstring b1); +extern int bstrnicmp (const_bstring b0, const_bstring b1, int n); +extern int biseqcaseless (const_bstring b0, const_bstring b1); +extern int bisstemeqcaselessblk (const_bstring b0, const void * blk, int len); +extern int biseq (const_bstring b0, const_bstring b1); +extern int bisstemeqblk (const_bstring b0, const void * blk, int len); +extern int biseqcstr (const_bstring b, const char * s); +extern int biseqcstrcaseless (const_bstring b, const char * s); +extern int bstrcmp (const_bstring b0, const_bstring b1); +extern int bstrncmp (const_bstring b0, const_bstring b1, int n); +extern int binstr (const_bstring s1, int pos, const_bstring s2); +extern int binstrr (const_bstring s1, int pos, const_bstring s2); +extern int binstrcaseless (const_bstring s1, int pos, const_bstring s2); +extern int binstrrcaseless
(const_bstring s1, int pos, const_bstring s2); +extern int bstrchrp (const_bstring b, int c, int pos); +extern int bstrrchrp (const_bstring b, int c, int pos); +#define bstrchr(b,c) bstrchrp ((b), (c), 0) +#define bstrrchr(b,c) bstrrchrp ((b), (c), blength(b)-1) +extern int binchr (const_bstring b0, int pos, const_bstring b1); +extern int binchrr (const_bstring b0, int pos, const_bstring b1); +extern int bninchr (const_bstring b0, int pos, const_bstring b1); +extern int bninchrr (const_bstring b0, int pos, const_bstring b1); +extern int bfindreplace (bstring b, const_bstring find, const_bstring repl, int pos); +extern int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl, int pos); + +/* List of string container functions */ +struct bstrList { + int qty, mlen; + bstring * entry; +}; +extern struct bstrList * bstrListCreate (void); +extern int bstrListDestroy (struct bstrList * sl); +extern int bstrListAlloc (struct bstrList * sl, int msz); +extern int bstrListAllocMin (struct bstrList * sl, int msz); + +/* String split and join functions */ +extern struct bstrList * bsplit (const_bstring str, unsigned char splitChar); +extern struct bstrList * bsplits (const_bstring str, const_bstring splitStr); +extern struct bstrList * bsplitstr (const_bstring str, const_bstring splitStr); +extern bstring bjoin (const struct bstrList * bl, const_bstring sep); +extern int bsplitcb (const_bstring str, unsigned char splitChar, int pos, + int (* cb) (void * parm, int ofs, int len), void * parm); +extern int bsplitscb (const_bstring str, const_bstring splitStr, int pos, + int (* cb) (void * parm, int ofs, int len), void * parm); +extern int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos, + int (* cb) (void * parm, int ofs, int len), void * parm); + +/* Miscellaneous functions */ +extern int bpattern (bstring b, int len); +extern int btoupper (bstring b); +extern int btolower (bstring b); +extern int bltrimws (bstring b); +extern int brtrimws (bstring b); +extern int btrimws (bstring b); + +#if !defined (BSTRLIB_NOVSNP) +extern bstring bformat (const char * fmt, ...); +extern int bformata (bstring b, const char * fmt, ...); +extern int bassignformat (bstring b, const char * fmt, ...); +extern int bvcformata (bstring b, int count, const char * fmt, va_list arglist); + +#define bvformata(ret, b, fmt, lastarg) { \ +bstring bstrtmp_b = (b); \ +const char * bstrtmp_fmt = (fmt); \ +int bstrtmp_r = BSTR_ERR, bstrtmp_sz = 16; \ + for (;;) { \ + va_list bstrtmp_arglist; \ + va_start (bstrtmp_arglist, lastarg); \ + bstrtmp_r = bvcformata (bstrtmp_b, bstrtmp_sz, bstrtmp_fmt, bstrtmp_arglist); \ + va_end (bstrtmp_arglist); \ + if (bstrtmp_r >= 0) { /* Everything went ok */ \ + bstrtmp_r = BSTR_OK; \ + break; \ + } else if (-bstrtmp_r <= bstrtmp_sz) { /* A real error? 
*/ \ + bstrtmp_r = BSTR_ERR; \ + break; \ + } \ + bstrtmp_sz = -bstrtmp_r; /* Doubled or target size */ \ + } \ + ret = bstrtmp_r; \ +} + +#endif + +typedef int (*bNgetc) (void *parm); +typedef size_t (* bNread) (void *buff, size_t elsize, size_t nelem, void *parm); + +/* Input functions */ +extern bstring bgets (bNgetc getcPtr, void * parm, char terminator); +extern bstring bread (bNread readPtr, void * parm); +extern int bgetsa (bstring b, bNgetc getcPtr, void * parm, char terminator); +extern int bassigngets (bstring b, bNgetc getcPtr, void * parm, char terminator); +extern int breada (bstring b, bNread readPtr, void * parm); + +/* Stream functions */ +extern struct bStream * bsopen (bNread readPtr, void * parm); +extern void * bsclose (struct bStream * s); +extern int bsbufflength (struct bStream * s, int sz); +extern int bsreadln (bstring b, struct bStream * s, char terminator); +extern int bsreadlns (bstring r, struct bStream * s, const_bstring term); +extern int bsread (bstring b, struct bStream * s, int n); +extern int bsreadlna (bstring b, struct bStream * s, char terminator); +extern int bsreadlnsa (bstring r, struct bStream * s, const_bstring term); +extern int bsreada (bstring b, struct bStream * s, int n); +extern int bsunread (struct bStream * s, const_bstring b); +extern int bspeek (bstring r, const struct bStream * s); +extern int bssplitscb (struct bStream * s, const_bstring splitStr, + int (* cb) (void * parm, int ofs, const_bstring entry), void * parm); +extern int bssplitstrcb (struct bStream * s, const_bstring splitStr, + int (* cb) (void * parm, int ofs, const_bstring entry), void * parm); +extern int bseof (const struct bStream * s); + +struct tagbstring { + int mlen; + int slen; + unsigned char * data; +}; + +/* Accessor macros */ +#define blengthe(b, e) (((b) == (void *)0 || (b)->slen < 0) ? (int)(e) : ((b)->slen)) +#define blength(b) (blengthe ((b), 0)) +#define bdataofse(b, o, e) (((b) == (void *)0 || (b)->data == (void*)0) ? (char *)(e) : ((char *)(b)->data) + (o)) +#define bdataofs(b, o) (bdataofse ((b), (o), (void *)0)) +#define bdatae(b, e) (bdataofse (b, 0, e)) +#define bdata(b) (bdataofs (b, 0)) +#define bchare(b, p, e) ((((unsigned)(p)) < (unsigned)blength(b)) ? ((b)->data[(p)]) : (e)) +#define bchar(b, p) bchare ((b), (p), '\0') + +/* Static constant string initialization macro */ +#define bsStaticMlen(q,m) {(m), (int) sizeof(q)-1, (unsigned char *) ("" q "")} +#if defined(_MSC_VER) +# define bsStatic(q) bsStaticMlen(q,-32) +#endif +#ifndef bsStatic +# define bsStatic(q) bsStaticMlen(q,-__LINE__) +#endif + +/* Static constant block parameter pair */ +#define bsStaticBlkParms(q) ((void *)("" q "")), ((int) sizeof(q)-1) + +/* Reference building macros */ +#define cstr2tbstr btfromcstr +#define btfromcstr(t,s) { \ + (t).data = (unsigned char *) (s); \ + (t).slen = ((t).data) ? 
((int) (strlen) ((char *)(t).data)) : 0; \ + (t).mlen = -1; \ +} +#define blk2tbstr(t,s,l) { \ + (t).data = (unsigned char *) (s); \ + (t).slen = l; \ + (t).mlen = -1; \ +} +#define btfromblk(t,s,l) blk2tbstr(t,s,l) +#define bmid2tbstr(t,b,p,l) { \ + const_bstring bstrtmp_s = (b); \ + if (bstrtmp_s && bstrtmp_s->data && bstrtmp_s->slen >= 0) { \ + int bstrtmp_left = (p); \ + int bstrtmp_len = (l); \ + if (bstrtmp_left < 0) { \ + bstrtmp_len += bstrtmp_left; \ + bstrtmp_left = 0; \ + } \ + if (bstrtmp_len > bstrtmp_s->slen - bstrtmp_left) \ + bstrtmp_len = bstrtmp_s->slen - bstrtmp_left; \ + if (bstrtmp_len <= 0) { \ + (t).data = (unsigned char *)""; \ + (t).slen = 0; \ + } else { \ + (t).data = bstrtmp_s->data + bstrtmp_left; \ + (t).slen = bstrtmp_len; \ + } \ + } else { \ + (t).data = (unsigned char *)""; \ + (t).slen = 0; \ + } \ + (t).mlen = -__LINE__; \ +} +#define btfromblkltrimws(t,s,l) { \ + int bstrtmp_idx = 0, bstrtmp_len = (l); \ + unsigned char * bstrtmp_s = (s); \ + if (bstrtmp_s && bstrtmp_len >= 0) { \ + for (; bstrtmp_idx < bstrtmp_len; bstrtmp_idx++) { \ + if (!isspace (bstrtmp_s[bstrtmp_idx])) break; \ + } \ + } \ + (t).data = bstrtmp_s + bstrtmp_idx; \ + (t).slen = bstrtmp_len - bstrtmp_idx; \ + (t).mlen = -__LINE__; \ +} +#define btfromblkrtrimws(t,s,l) { \ + int bstrtmp_len = (l) - 1; \ + unsigned char * bstrtmp_s = (s); \ + if (bstrtmp_s && bstrtmp_len >= 0) { \ + for (; bstrtmp_len >= 0; bstrtmp_len--) { \ + if (!isspace (bstrtmp_s[bstrtmp_len])) break; \ + } \ + } \ + (t).data = bstrtmp_s; \ + (t).slen = bstrtmp_len + 1; \ + (t).mlen = -__LINE__; \ +} +#define btfromblktrimws(t,s,l) { \ + int bstrtmp_idx = 0, bstrtmp_len = (l) - 1; \ + unsigned char * bstrtmp_s = (s); \ + if (bstrtmp_s && bstrtmp_len >= 0) { \ + for (; bstrtmp_idx <= bstrtmp_len; bstrtmp_idx++) { \ + if (!isspace (bstrtmp_s[bstrtmp_idx])) break; \ + } \ + for (; bstrtmp_len >= bstrtmp_idx; bstrtmp_len--) { \ + if (!isspace (bstrtmp_s[bstrtmp_len])) break; \ + } \ + } \ + (t).data = bstrtmp_s + bstrtmp_idx; \ + (t).slen = bstrtmp_len + 1 - bstrtmp_idx; \ + (t).mlen = -__LINE__; \ +} + +/* Write protection macros */ +#define bwriteprotect(t) { if ((t).mlen >= 0) (t).mlen = -1; } +#define bwriteallow(t) { if ((t).mlen == -1) (t).mlen = (t).slen + ((t).slen == 0); } +#define biswriteprotected(t) ((t).mlen <= 0) + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/collectors/likwid/groups/CLX/BRANCH.txt b/collectors/likwid/groups/CLX/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/CLX/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. 
The branch misprediction ratio directly expresses what +fraction of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/CLX/CACHES.txt b/collectors/likwid/groups/CLX/CACHES.txt new file mode 100644 index 0000000..c700dd4 --- /dev/null +++ b/collectors/likwid/groups/CLX/CACHES.txt @@ -0,0 +1,143 @@ +SHORT Cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L1D_M_EVICT +PMC2 L2_LINES_IN_ALL +PMC3 L2_TRANS_L2_WB +CBOX0C1 LLC_VICTIMS_M_STATE +CBOX1C1 LLC_VICTIMS_M_STATE +CBOX2C1 LLC_VICTIMS_M_STATE +CBOX3C1 LLC_VICTIMS_M_STATE +CBOX4C1 LLC_VICTIMS_M_STATE +CBOX5C1 LLC_VICTIMS_M_STATE +CBOX6C1 LLC_VICTIMS_M_STATE +CBOX7C1 LLC_VICTIMS_M_STATE +CBOX8C1 LLC_VICTIMS_M_STATE +CBOX9C1 LLC_VICTIMS_M_STATE +CBOX10C1 LLC_VICTIMS_M_STATE +CBOX11C1 LLC_VICTIMS_M_STATE +CBOX12C1 LLC_VICTIMS_M_STATE +CBOX13C1 LLC_VICTIMS_M_STATE +CBOX14C1 LLC_VICTIMS_M_STATE +CBOX15C1 LLC_VICTIMS_M_STATE +CBOX16C1 LLC_VICTIMS_M_STATE +CBOX17C1 LLC_VICTIMS_M_STATE +CBOX18C1 LLC_VICTIMS_M_STATE +CBOX19C1 LLC_VICTIMS_M_STATE +CBOX20C1 LLC_VICTIMS_M_STATE +CBOX21C1 LLC_VICTIMS_M_STATE +CBOX22C1 LLC_VICTIMS_M_STATE +CBOX23C1 LLC_VICTIMS_M_STATE +CBOX24C1 LLC_VICTIMS_M_STATE +CBOX25C1 LLC_VICTIMS_M_STATE +CBOX26C1 LLC_VICTIMS_M_STATE +CBOX27C1 LLC_VICTIMS_M_STATE +CBOX0C0 LLC_LOOKUP_DATA_READ +CBOX1C0 LLC_LOOKUP_DATA_READ +CBOX2C0 LLC_LOOKUP_DATA_READ +CBOX3C0 LLC_LOOKUP_DATA_READ +CBOX4C0 LLC_LOOKUP_DATA_READ +CBOX5C0 LLC_LOOKUP_DATA_READ +CBOX6C0 LLC_LOOKUP_DATA_READ +CBOX7C0 LLC_LOOKUP_DATA_READ +CBOX8C0 LLC_LOOKUP_DATA_READ +CBOX9C0 LLC_LOOKUP_DATA_READ +CBOX10C0 LLC_LOOKUP_DATA_READ +CBOX11C0 LLC_LOOKUP_DATA_READ +CBOX12C0 LLC_LOOKUP_DATA_READ +CBOX13C0 LLC_LOOKUP_DATA_READ +CBOX14C0 LLC_LOOKUP_DATA_READ +CBOX15C0 LLC_LOOKUP_DATA_READ +CBOX16C0 LLC_LOOKUP_DATA_READ +CBOX17C0 LLC_LOOKUP_DATA_READ +CBOX18C0 LLC_LOOKUP_DATA_READ +CBOX19C0 LLC_LOOKUP_DATA_READ +CBOX20C0 LLC_LOOKUP_DATA_READ +CBOX21C0 LLC_LOOKUP_DATA_READ +CBOX22C0 LLC_LOOKUP_DATA_READ +CBOX23C0 LLC_LOOKUP_DATA_READ +CBOX24C0 LLC_LOOKUP_DATA_READ +CBOX25C0 LLC_LOOKUP_DATA_READ +CBOX26C0 LLC_LOOKUP_DATA_READ +CBOX27C0 LLC_LOOKUP_DATA_READ +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 +L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time +L3 to L2 load data volume [GBytes] 1.0E-09*PMC2*64.0 +L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time +L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 +System to L3 bandwidth [MBytes/s]
1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0)*64.0/time +System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0)*64.0 +L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64/time +L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64 +L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64.0/time +L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64.0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 + +LONG +Formulas: +L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time +L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64 +L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time +L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64 +L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time +L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64 +L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time +L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64 +L2 to L3 evict bandwidth [MBytes/s] = 
1.0E-06*L2_TRANS_L2_WB*64/time +L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64 +L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ))*64/time +System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ))*64 +L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M_STATE))*64/time +L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M_STATE))*64 +L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M_STATE))*64/time +L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M_STATE))*64 +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 +- +Group to measure cache transfers between L1 and memory. Please note that the +L3 to/from system metrics contain any traffic to the system (memory, +Intel UPI, etc.) but do not seem to capture all of it, because commonly the memory read +bandwidth and the L3 to L2 bandwidth are higher than the memory to L3 bandwidth. + diff --git a/collectors/likwid/groups/CLX/CLOCK.txt b/collectors/likwid/groups/CLX/CLOCK.txt new file mode 100644 index 0000000..b81bee6 --- /dev/null +++ b/collectors/likwid/groups/CLX/CLOCK.txt @@ -0,0 +1,26 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +UBOXFIX UNCORE_CLOCK + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*UBOXFIX/time +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time +- +Cascade Lake implements the RAPL interface. This interface makes it possible to +monitor the consumed energy on the package (socket) level.
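As an illustration of how the CLOCK metrics fall out of the raw counts, here is a minimal C sketch (hypothetical helper and field names, not part of LIKWID; inverse_clock is assumed to be the reciprocal of the nominal core frequency in Hz, as in the metric formulas):

#include <stdio.h>

/* Raw readings for one measurement interval (assumed already accumulated). */
typedef struct {
    double clk_unhalted_core; /* FIXC1 CPU_CLK_UNHALTED_CORE */
    double clk_unhalted_ref;  /* FIXC2 CPU_CLK_UNHALTED_REF */
    double pkg_energy_j;      /* PWR0 PWR_PKG_ENERGY, scaled to Joules */
    double uncore_clock;      /* UBOXFIX UNCORE_CLOCK */
    double time_s;            /* Runtime (RDTSC) [s] */
    double inverse_clock;     /* 1 / nominal core frequency [1/Hz] */
} clock_sample;

static void print_clock_metrics(const clock_sample *s)
{
    double core_mhz   = 1e-6 * (s->clk_unhalted_core / s->clk_unhalted_ref) / s->inverse_clock;
    double uncore_mhz = 1e-6 * s->uncore_clock / s->time_s;
    double power_w    = s->pkg_energy_j / s->time_s; /* RAPL reports energy; power is derived */
    printf("Clock %.1f MHz, Uncore %.1f MHz, Power %.2f W\n", core_mhz, uncore_mhz, power_w);
}

Note that the RAPL registers report accumulated energy, so average power is only defined relative to a measurement interval.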
+ diff --git a/collectors/likwid/groups/CLX/CYCLE_ACTIVITY.txt b/collectors/likwid/groups/CLX/CYCLE_ACTIVITY.txt new file mode 100644 index 0000000..c432a44 --- /dev/null +++ b/collectors/likwid/groups/CLX/CYCLE_ACTIVITY.txt @@ -0,0 +1,38 @@ +SHORT Cycle Activities + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING +PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING +PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING +PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Cycles without execution [%] (PMC3/FIXC1)*100 +Cycles without execution due to L1D [%] (PMC2/FIXC1)*100 +Cycles without execution due to L2 [%] (PMC0/FIXC1)*100 +Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100 +-- +This performance group measures the cycles spent waiting for data from the cache +and memory hierarchy. +CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts the number of cycles in which nothing is executed on +any execution port. +CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while an L1 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while an L2 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while the memory subsystem has an +outstanding load.
diff --git a/collectors/likwid/groups/CLX/CYCLE_STALLS.txt b/collectors/likwid/groups/CLX/CYCLE_STALLS.txt new file mode 100644 index 0000000..795aeb9 --- /dev/null +++ b/collectors/likwid/groups/CLX/CYCLE_STALLS.txt @@ -0,0 +1,45 @@ +SHORT Cycle Activities (Stalls) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING +PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING +PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING +PMC3 CYCLE_ACTIVITY_STALLS_TOTAL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Total execution stalls PMC3 +Stalls caused by L1D misses [%] (PMC2/PMC3)*100 +Stalls caused by L2 misses [%] (PMC0/PMC3)*100 +Stalls caused by memory loads [%] (PMC1/PMC3)*100 +Execution stall rate [%] (PMC3/FIXC1)*100 +Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100 +Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100 +Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL +Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100 +-- +This performance group measures the stalls caused by data traffic in the cache +hierarchy. +CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls. +CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while memory subsystem has +an outstanding load. diff --git a/collectors/likwid/groups/CLX/DATA.txt b/collectors/likwid/groups/CLX/DATA.txt new file mode 100644 index 0000000..4e6e938 --- /dev/null +++ b/collectors/likwid/groups/CLX/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_INST_RETIRED_ALL_LOADS +PMC1 MEM_INST_RETIRED_ALL_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES +- +This is a metric to determine your load to store ratio. 
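For intuition about the DATA group's headline metric, consider an illustrative kernel (not part of the group definition): a daxpy-style update performs two loads and one store per iteration, so MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES should approach 2 for large arrays.

/* Illustrative only: per iteration this loads x[i] and y[i] and stores y[i],
 * giving an ideal load to store ratio of 2. */
void daxpy(int n, double a, const double *x, double *y)
{
    for (int i = 0; i < n; i++)
        y[i] = a * x[i] + y[i];
}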
+ diff --git a/collectors/likwid/groups/CLX/DIVIDE.txt b/collectors/likwid/groups/CLX/DIVIDE.txt new file mode 100644 index 0000000..2c6222d --- /dev/null +++ b/collectors/likwid/groups/CLX/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ARITH_DIVIDER_COUNT +PMC1 ARITH_DIVIDER_ACTIVE + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_DIVIDER_COUNT +Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT +-- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/CLX/ENERGY.txt b/collectors/likwid/groups/CLX/ENERGY.txt new file mode 100644 index 0000000..fe7829f --- /dev/null +++ b/collectors/likwid/groups/CLX/ENERGY.txt @@ -0,0 +1,35 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR3 PWR_DRAM_ENERGY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +Cascade Lake implements the RAPL interface. This interface makes it possible to +monitor the consumed energy on the package (socket) and DRAM levels. + diff --git a/collectors/likwid/groups/CLX/FLOPS_AVX.txt b/collectors/likwid/groups/CLX/FLOPS_AVX.txt new file mode 100644 index 0000000..e44a913 --- /dev/null +++ b/collectors/likwid/groups/CLX/FLOPS_AVX.txt @@ -0,0 +1,25 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0+PMC2*16.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0+PMC3*8.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +- +Packed 256-bit and 512-bit AVX FLOP rates.
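The weights in the FLOPS_AVX formulas are FLOPs per retired packed operation: a 256-bit register holds 8 single-precision or 4 double-precision elements, a 512-bit register 16 or 8. A minimal sketch of that conversion (hypothetical helper, raw counts assumed):

/* MFLOP/s from packed AVX/AVX-512 operation counts, mirroring the formulas above. */
static double packed_sp_mflops(double avx256_sp, double avx512_sp, double time_s)
{
    return 1e-6 * (avx256_sp * 8.0 + avx512_sp * 16.0) / time_s;
}

static double packed_dp_mflops(double avx256_dp, double avx512_dp, double time_s)
{
    return 1e-6 * (avx256_dp * 4.0 + avx512_dp * 8.0) / time_s;
}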
diff --git a/collectors/likwid/groups/CLX/FLOPS_DP.txt b/collectors/likwid/groups/CLX/FLOPS_DP.txt new file mode 100644 index 0000000..7d6af79 --- /dev/null +++ b/collectors/likwid/groups/CLX/FLOPS_DP.txt @@ -0,0 +1,34 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time +AVX512 DP [MFLOP/s] 1.0E-06*(PMC3*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +AVX512 DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE) +- +Scalar and packed (SSE, AVX and AVX-512) double precision FLOP rates.
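One caveat worth making explicit: the vectorization ratio is a ratio of retired FP operations, not of FLOPs, so a 512-bit packed operation counts once in the numerator even though it contributes eight double-precision FLOPs. A sketch (hypothetical helper):

/* Vectorization ratio as defined above: share of packed operations among
 * all retired FP operations, independent of the vector width. */
static double dp_vectorization_ratio(double sse128, double scalar,
                                     double avx256, double avx512)
{
    double packed = sse128 + avx256 + avx512;
    return 100.0 * packed / (packed + scalar);
}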
+ diff --git a/collectors/likwid/groups/CLX/FLOPS_SP.txt b/collectors/likwid/groups/CLX/FLOPS_SP.txt new file mode 100644 index 0000000..39fb08d --- /dev/null +++ b/collectors/likwid/groups/CLX/FLOPS_SP.txt @@ -0,0 +1,34 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time +AVX512 SP [MFLOP/s] 1.0E-06*(PMC3*16.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +AVX512 SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE) +- +Scalar and packed (SSE, AVX and AVX-512) single precision FLOP rates. + diff --git a/collectors/likwid/groups/CLX/L2.txt b/collectors/likwid/groups/CLX/L2.txt new file mode 100644 index 0000000..1a92a95 --- /dev/null +++ b/collectors/likwid/groups/CLX/L2.txt @@ -0,0 +1,38 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L1D_M_EVICT +PMC2 ICACHE_64B_IFTAG_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_64B_IFTAG_MISS)*64/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_64B_IFTAG_MISS)*64 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the +number of cache lines allocated in the L1 and the number of modified cache lines +evicted from the L1.
The group also outputs the total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and traffic caused by misses in the +L1 instruction cache. + diff --git a/collectors/likwid/groups/CLX/L2CACHE.txt b/collectors/likwid/groups/CLX/L2CACHE.txt new file mode 100644 index 0000000..9b5dd4b --- /dev/null +++ b/collectors/likwid/groups/CLX/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_TRANS_ALL_REQUESTS +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/CLX/L3.txt b/collectors/likwid/groups/CLX/L3.txt new file mode 100644 index 0000000..98d1d9e --- /dev/null +++ b/collectors/likwid/groups/CLX/L3.txt @@ -0,0 +1,36 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ALL +PMC1 L2_TRANS_L2_WB + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the +number of cache lines allocated in the L2 and the number of modified cache lines +evicted from the L2. This group also outputs the data volume transferred between the +L3 and the measured cores' L2 caches. Note that this bandwidth also includes data +transfers due to a write allocate load on a store miss in L2.
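All of the cache bandwidth groups above share one conversion: every counted event stands for one 64-byte cache line. A minimal sketch of the arithmetic (hypothetical helper):

/* Convert a cache-line event count into bandwidth and volume,
 * as in the L2/L3/CACHES formulas (64 bytes per cache line). */
static void lines_to_traffic(double lines, double time_s,
                             double *mbytes_per_s, double *gbytes)
{
    double bytes = lines * 64.0;
    *mbytes_per_s = 1e-6 * bytes / time_s;
    *gbytes = 1e-9 * bytes;
}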
+ diff --git a/collectors/likwid/groups/CLX/L3CACHE.txt b/collectors/likwid/groups/CLX/L3CACHE.txt new file mode 100644 index 0000000..bc664d1 --- /dev/null +++ b/collectors/likwid/groups/CLX/L3CACHE.txt @@ -0,0 +1,35 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_RETIRED_L3_HIT +PMC1 MEM_LOAD_RETIRED_L3_MISS +PMC2 UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate (PMC0+PMC1)/PMC2 +L3 miss rate PMC1/PMC2 +L3 miss ratio PMC1/(PMC0+PMC1) + +LONG +Formulas: +L3 request rate = (MEM_LOAD_RETIRED_L3_HIT+MEM_LOAD_RETIRED_L3_MISS)/UOPS_RETIRED_ALL +L3 miss rate = MEM_LOAD_RETIRED_L3_MISS/UOPS_RETIRED_ALL +L3 miss ratio = MEM_LOAD_RETIRED_L3_MISS/(MEM_LOAD_RETIRED_L3_HIT+MEM_LOAD_RETIRED_L3_MISS) +- +This group measures the locality of your data accesses with regard to the +L3 cache. L3 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L3 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/CLX/MEM.txt b/collectors/likwid/groups/CLX/MEM.txt new file mode 100644 index 0000000..3d50ecb --- /dev/null +++ b/collectors/likwid/groups/CLX/MEM.txt @@ -0,0 +1,48 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events it is only possible to measure on a +per-socket basis. Some of the counters may not be available on your system. +Also outputs total data volume transferred from main memory. +The same metrics are provided by the HA group. + diff --git a/collectors/likwid/groups/CLX/MEM_DP.txt b/collectors/likwid/groups/CLX/MEM_DP.txt new file mode 100644 index 0000000..68e8684 --- /dev/null +++ b/collectors/likwid/groups/CLX/MEM_DP.txt @@ -0,0 +1,70 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +Operational intensity (PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 +Operational intensity =
(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/((SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0) +-- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per-socket basis. Also outputs total data volume transferred from main memory. +SSE scalar and packed double precision FLOP rates. Also reports on packed AVX +and AVX-512 operations. +The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. diff --git a/collectors/likwid/groups/CLX/MEM_SP.txt b/collectors/likwid/groups/CLX/MEM_SP.txt new file mode 100644 index 0000000..73452f2 --- /dev/null +++ b/collectors/likwid/groups/CLX/MEM_SP.txt @@ -0,0 +1,70 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +Operational intensity (PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] =
1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 +Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/((SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0) +-- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per-socket basis. Also outputs total data volume transferred from main memory. +SSE scalar and packed single precision FLOP rates. Also reports on packed AVX +and AVX-512 operations. +The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. diff --git a/collectors/likwid/groups/CLX/PMM.txt b/collectors/likwid/groups/CLX/PMM.txt new file mode 100644 index 0000000..dbaa6ab --- /dev/null +++ b/collectors/likwid/groups/CLX/PMM.txt @@ -0,0 +1,46 @@ +SHORT Intel Optane DC bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +MBOX0C0 PMM_CMD1_RD +MBOX0C1 PMM_CMD1_WR +MBOX1C0 PMM_CMD1_RD +MBOX1C1 PMM_CMD1_WR +MBOX2C0 PMM_CMD1_RD +MBOX2C1 PMM_CMD1_WR +MBOX3C0 PMM_CMD1_RD +MBOX3C1 PMM_CMD1_WR +MBOX4C0 PMM_CMD1_RD +MBOX4C1 PMM_CMD1_WR +MBOX5C0 PMM_CMD1_RD +MBOX5C1 PMM_CMD1_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +PMM read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time +PMM read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 +PMM write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +PMM write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +PMM bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +PMM data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 + +LONG +Formulas: +PMM read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime +PMM read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +PMM write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime +PMM write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +PMM bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime +PMM data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +- +Profiling group to measure data rate and volume for accesses to Intel Optane DC +persistent memory. The Intel Optane DC devices are handled by the memory +controllers but require different events.
+ diff --git a/collectors/likwid/groups/CLX/TLB_DATA.txt b/collectors/likwid/groups/CLX/TLB_DATA.txt new file mode 100644 index 0000000..10ee5e1 --- /dev/null +++ b/collectors/likwid/groups/CLX/TLB_DATA.txt @@ -0,0 +1,35 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK +PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK +PMC2 DTLB_LOAD_MISSES_WALK_ACTIVE +PMC3 DTLB_STORE_MISSES_WALK_ACTIVE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB load misses PMC0 +L1 DTLB load miss rate PMC0/FIXC0 +L1 DTLB load miss duration [Cyc] PMC2/PMC0 +L1 DTLB store misses PMC1 +L1 DTLB store miss rate PMC1/FIXC0 +L1 DTLB store miss duration [Cyc] PMC3/PMC1 + +LONG +Formulas: +L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_ACTIVE / DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK +L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_ACTIVE / DTLB_STORE_MISSES_CAUSES_A_WALK +- +The DTLB load and store miss rates give a measure of how often a TLB miss occurred +per instruction. The duration measures how long, in cycles, a page table walk took. + diff --git a/collectors/likwid/groups/CLX/TLB_INSTR.txt b/collectors/likwid/groups/CLX/TLB_INSTR.txt new file mode 100644 index 0000000..9bc65a7 --- /dev/null +++ b/collectors/likwid/groups/CLX/TLB_INSTR.txt @@ -0,0 +1,28 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ITLB_MISSES_CAUSES_A_WALK +PMC1 ITLB_MISSES_WALK_ACTIVE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + + +LONG +Formulas: +L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK +L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_ACTIVE / ITLB_MISSES_CAUSES_A_WALK +- +The ITLB miss rate gives a measure of how often a TLB miss occurred +per instruction. The duration measures how long, in cycles, a page table walk took.
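Besides the average walk duration, the walk-active cycles can be set against all unhalted core cycles to estimate the share of runtime lost to page table walks; this is not a metric of these groups, just a sketch under that assumption (hypothetical names):

#include <stdio.h>

/* walks = *_CAUSES_A_WALK count, walk_cycles = *_WALK_ACTIVE count. */
static void tlb_walk_cost(double walks, double walk_cycles, double core_cycles)
{
    double avg_walk   = walk_cycles / walks;                 /* cycles per walk */
    double walk_share = 100.0 * walk_cycles / core_cycles;   /* % of core cycles */
    printf("avg walk %.1f cycles, %.2f%% of cycles in walks\n", avg_walk, walk_share);
}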
+ diff --git a/collectors/likwid/groups/CLX/TMA.txt b/collectors/likwid/groups/CLX/TMA.txt new file mode 100644 index 0000000..afb4126 --- /dev/null +++ b/collectors/likwid/groups/CLX/TMA.txt @@ -0,0 +1,48 @@ +SHORT Top down cycle allocation + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_RETIRED_RETIRE_SLOTS +PMC2 IDQ_UOPS_NOT_DELIVERED_CORE +PMC3 INT_MISC_RECOVERY_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +IPC FIXC0/FIXC1 +Total Slots 4*FIXC1 +Slots Retired PMC1 +Fetch Bubbles PMC2 +Recovery Bubbles 4*PMC3 +Front End [%] PMC2/(4*FIXC1)*100 +Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100 +Retiring [%] PMC1/(4*FIXC1)*100 +Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100 + +LONG +Formulas: +Total Slots = 4*CPU_CLK_UNHALTED_CORE +Slots Retired = UOPS_RETIRED_RETIRE_SLOTS +Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE +Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES +Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100 +Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100 +Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100 +Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100 +-- +This performance group measures cycles to determine the percentage of time spent in +front end, back end, retiring and speculation. These metrics are published and +verified by Intel. Further information: +Webpage describing the Top-Down Method and its usage in Intel vTune: +https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method +Paper by Ahmad Yasin: +https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0 +Slides by Ahmad Yasin: +http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf +The performance group was originally published here: +http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/ diff --git a/collectors/likwid/groups/CLX/UOPS_EXEC.txt b/collectors/likwid/groups/CLX/UOPS_EXEC.txt new file mode 100644 index 0000000..7042df7 --- /dev/null +++ b/collectors/likwid/groups/CLX/UOPS_EXEC.txt @@ -0,0 +1,31 @@ +SHORT UOPs execution + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_USED_CYCLES +PMC1 UOPS_EXECUTED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the execution stage in the pipeline. Used cycles are all cycles where uOPs are +executed while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles.
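For the TMA group above, the four counters yield the complete top-down level-1 breakdown; here is a sketch of the slot accounting (hypothetical helper; four issue slots per core cycle on this microarchitecture):

#include <stdio.h>

/* issued = UOPS_ISSUED_ANY, retired = UOPS_RETIRED_RETIRE_SLOTS,
 * bubbles = IDQ_UOPS_NOT_DELIVERED_CORE, recovery = INT_MISC_RECOVERY_CYCLES. */
static void tma_level1(double issued, double retired, double bubbles,
                       double recovery, double core_cycles)
{
    double slots       = 4.0 * core_cycles;
    double front_end   = 100.0 * bubbles / slots;
    double speculation = 100.0 * (issued - retired + 4.0 * recovery) / slots;
    double retiring    = 100.0 * retired / slots;
    /* Back end is the remainder; algebraically identical to the group formula. */
    double back_end    = 100.0 - front_end - speculation - retiring;
    printf("FE %.1f%%  Spec %.1f%%  Ret %.1f%%  BE %.1f%%\n",
           front_end, speculation, retiring, back_end);
}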
diff --git a/collectors/likwid/groups/CLX/UOPS_ISSUE.txt b/collectors/likwid/groups/CLX/UOPS_ISSUE.txt new file mode 100644 index 0000000..9aac923 --- /dev/null +++ b/collectors/likwid/groups/CLX/UOPS_ISSUE.txt @@ -0,0 +1,31 @@ +SHORT UOPs issuing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_USED_CYCLES +PMC1 UOPS_ISSUED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the issue stage in the pipeline. Used cycles are all cycles where uOPs are +issued while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles. diff --git a/collectors/likwid/groups/CLX/UOPS_RETIRE.txt b/collectors/likwid/groups/CLX/UOPS_RETIRE.txt new file mode 100644 index 0000000..0f37585 --- /dev/null +++ b/collectors/likwid/groups/CLX/UOPS_RETIRE.txt @@ -0,0 +1,31 @@ +SHORT UOPs retirement + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_RETIRED_USED_CYCLES +PMC1 UOPS_RETIRED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the retirement stage in the pipeline (re-order buffer). Used cycles are all +cycles where uOPs are retired while unused cycles refer to pipeline stalls. +Moreover, the group calculates the average stall duration in cycles.
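In all three UOPS_* groups the :EDGEDETECT modifier turns the stall-cycle event into a count of stall episodes (rising edges from not-stalled to stalled), so total stall cycles divided by the edge count gives the mean stall length. A tiny sketch with hypothetical values (illustration only):

    #include <stdio.h>

    int main(void) {
        double stall_cycles = 5.0e8;  /* PMC1, e.g. UOPS_ISSUED_STALL_CYCLES */
        double stall_edges  = 1.0e7;  /* PMC3:EDGEDETECT, number of stall phases */

        /* each edge marks the start of one stall phase */
        double avg_stall = stall_cycles / stall_edges;
        printf("Avg stall duration: %.1f cycles over %.0f stall phases\n",
               avg_stall, stall_edges);
        return 0;
    }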
diff --git a/collectors/likwid/groups/CLX/UPI.txt b/collectors/likwid/groups/CLX/UPI.txt new file mode 100644 index 0000000..2a4c44f --- /dev/null +++ b/collectors/likwid/groups/CLX/UPI.txt @@ -0,0 +1,42 @@ +SHORT UPI data traffic + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +SBOX0C0 TXL_FLITS_ALL_DATA +SBOX0C1 RXL_FLITS_ALL_DATA +SBOX1C0 TXL_FLITS_ALL_DATA +SBOX1C1 RXL_FLITS_ALL_DATA +SBOX2C0 TXL_FLITS_ALL_DATA +SBOX2C1 RXL_FLITS_ALL_DATA + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Received data bandwidth [MByte/s] 1.0E-06*((SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0/time +Received data volume [GByte] 1.0E-09*((SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0 +Sent data bandwidth [MByte/s] 1.0E-06*((SBOX0C0+SBOX1C0+SBOX2C0)/9.0)*64.0/time +Sent data volume [GByte] 1.0E-09*((SBOX0C0+SBOX1C0+SBOX2C0)/9.0)*64.0 +Total data bandwidth [MByte/s] 1.0E-06*((SBOX0C0+SBOX1C0+SBOX2C0+SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0/time +Total data volume [GByte] 1.0E-09*((SBOX0C0+SBOX1C0+SBOX2C0+SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0 + + +LONG +Formulas: +Received data bandwidth [MByte/s] = 1.0E-06*(SUM(RXL_FLITS_ALL_DATA)/9.0)*64.0/runtime +Received data volume [GByte] = 1.0E-09*(SUM(RXL_FLITS_ALL_DATA)/9.0)*64.0 +Sent data bandwidth [MByte/s] = 1.0E-06*(SUM(TXL_FLITS_ALL_DATA)/9.0)*64.0/time +Sent data volume [GByte] = 1.0E-09*(SUM(TXL_FLITS_ALL_DATA)/9.0)*64.0 +Total data bandwidth [MByte/s] = 1.0E-06*((SUM(RXL_FLITS_ALL_DATA)+SUM(TXL_FLITS_ALL_DATA))/9.0)*64.0/time +Total data volume [GByte] = 1.0E-09*((SUM(RXL_FLITS_ALL_DATA)+SUM(TXL_FLITS_ALL_DATA))/9.0)*64.0 +-- +This group measures the data traffic on the UPI (socket interconnect). The group +counts all filled data slots (9 slots per 64 Byte data transfer), which is why +the count needs to be divided by 9. These 9 data chunks are not transferred in +a single flit; instead, there is one flit for the header and three flits for the data. +The metrics may show higher values than expected because the events also count +other transfer types which include data. diff --git a/collectors/likwid/groups/ICL/BRANCH.txt b/collectors/likwid/groups/ICL/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/ICL/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often, on average, a branch or a mispredicted branch occurred +per retired instruction. The branch misprediction ratio directly expresses what +fraction of all branch instructions was mispredicted. +Instructions per branch is 1/branch rate.
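The /9.0*64.0 factor in the UPI group above converts data-flit slots back into bytes: 9 occupied data slots correspond to one 64-byte transfer. A minimal C sketch of the conversion with hypothetical counts (illustration only):

    #include <stdio.h>

    int main(void) {
        double rx_flits = 9.0e8;  /* sum of RXL_FLITS_ALL_DATA over the SBOX units */
        double seconds  = 2.0;    /* measurement interval */

        double transfers = rx_flits / 9.0;  /* 9 data slots per 64-byte transfer */
        double bytes     = transfers * 64.0;
        printf("Received: %.1f MByte/s (%.2f GByte total)\n",
               1.0e-6 * bytes / seconds, 1.0e-9 * bytes);
        return 0;
    }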
+ diff --git a/collectors/likwid/groups/ICL/DATA.txt b/collectors/likwid/groups/ICL/DATA.txt new file mode 100644 index 0000000..4e6e938 --- /dev/null +++ b/collectors/likwid/groups/ICL/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_INST_RETIRED_ALL_LOADS +PMC1 MEM_INST_RETIRED_ALL_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES +- +This is a metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/ICL/DIVIDE.txt b/collectors/likwid/groups/ICL/DIVIDE.txt new file mode 100644 index 0000000..40b4ab6 --- /dev/null +++ b/collectors/likwid/groups/ICL/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ARITH_DIVIDER_COUNT +PMC1 ARITH_DIVIDER_ACTIVE + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_DIVIDER_COUNT +Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/ICL/ENERGY.txt b/collectors/likwid/groups/ICL/ENERGY.txt new file mode 100644 index 0000000..fe7829f --- /dev/null +++ b/collectors/likwid/groups/ICL/ENERGY.txt @@ -0,0 +1,35 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR3 PWR_DRAM_ENERGY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +The processor implements the RAPL interface. This interface enables +monitoring of the consumed energy on the package (socket) and DRAM level.
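RAPL reports energy in Joules, so the power metrics in the ENERGY group are simply energy divided by the measurement interval. A minimal C sketch with hypothetical readings (illustration only):

    #include <stdio.h>

    int main(void) {
        double pkg_energy  = 150.0; /* PWR0 PWR_PKG_ENERGY in Joules */
        double dram_energy = 20.0;  /* PWR3 PWR_DRAM_ENERGY in Joules */
        double seconds     = 2.0;   /* runtime of the measured region */

        printf("Package power: %.1f W\n", pkg_energy / seconds);
        printf("DRAM power:    %.1f W\n", dram_energy / seconds);
        return 0;
    }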
+ diff --git a/collectors/likwid/groups/ICL/FLOPS_AVX.txt b/collectors/likwid/groups/ICL/FLOPS_AVX.txt new file mode 100644 index 0000000..e44a913 --- /dev/null +++ b/collectors/likwid/groups/ICL/FLOPS_AVX.txt @@ -0,0 +1,25 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0+PMC2*16.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0+PMC3*8.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +- +Packed AVX (256-bit) and AVX-512 (512-bit) FLOP rates for single and double precision. diff --git a/collectors/likwid/groups/ICL/FLOPS_DP.txt b/collectors/likwid/groups/ICL/FLOPS_DP.txt new file mode 100644 index 0000000..177cff2 --- /dev/null +++ b/collectors/likwid/groups/ICL/FLOPS_DP.txt @@ -0,0 +1,34 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time +AVX512 DP [MFLOP/s] 1.0E-06*(PMC3*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +AVX512 DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE) +- +Scalar and packed (SSE/AVX/AVX-512) double-precision FLOP rates.
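The multipliers in the FLOPS groups are just elements per register: a 128-bit register holds 2 doubles, 256-bit holds 4, 512-bit holds 8 (twice as many for single precision), and each packed instruction counts that many FLOPs. A minimal C sketch with hypothetical counts (illustration only):

    #include <stdio.h>

    int main(void) {
        double scalar  = 1.0e9;  /* FP_ARITH_INST_RETIRED_SCALAR_DOUBLE */
        double p128    = 2.0e8;  /* 128B PACKED_DOUBLE: 2 FLOPs each */
        double p256    = 5.0e8;  /* 256B PACKED_DOUBLE: 4 FLOPs each */
        double p512    = 1.0e8;  /* 512B PACKED_DOUBLE: 8 FLOPs each */
        double seconds = 1.0;

        double flops = scalar + 2.0 * p128 + 4.0 * p256 + 8.0 * p512;
        double vec   = 100.0 * (p128 + p256 + p512)
                             / (scalar + p128 + p256 + p512);
        printf("DP: %.1f MFLOP/s, vectorization ratio %.1f%%\n",
               1.0e-6 * flops / seconds, vec);
        return 0;
    }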
+ diff --git a/collectors/likwid/groups/ICL/FLOPS_SP.txt b/collectors/likwid/groups/ICL/FLOPS_SP.txt new file mode 100644 index 0000000..01d98c2 --- /dev/null +++ b/collectors/likwid/groups/ICL/FLOPS_SP.txt @@ -0,0 +1,34 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time +AVX512 SP [MFLOP/s] 1.0E-06*(PMC3*16.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +AVX512 SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE) +- +Scalar and packed (SSE/AVX/AVX-512) single-precision FLOP rates. + diff --git a/collectors/likwid/groups/ICX/BRANCH.txt b/collectors/likwid/groups/ICX/BRANCH.txt new file mode 100644 index 0000000..3eea828 --- /dev/null +++ b/collectors/likwid/groups/ICX/BRANCH.txt @@ -0,0 +1,32 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often, on average, a branch or a mispredicted branch occurred +per retired instruction. The branch misprediction ratio directly expresses what +fraction of all branch instructions was mispredicted. +Instructions per branch is 1/branch rate.
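The distinction the BRANCH groups draw: the two rates are normalized to all retired instructions, while the ratio is normalized only to branches. Hypothetical numbers make the difference concrete (illustration only):

    #include <stdio.h>

    int main(void) {
        double instr    = 1.0e9; /* INSTR_RETIRED_ANY */
        double branches = 2.0e8; /* BR_INST_RETIRED_ALL_BRANCHES */
        double mispred  = 1.0e7; /* BR_MISP_RETIRED_ALL_BRANCHES */

        printf("Branch rate:                %.3f per instruction\n", branches / instr);
        printf("Branch misprediction rate:  %.3f per instruction\n", mispred / instr);
        printf("Branch misprediction ratio: %.1f%% of all branches\n",
               100.0 * mispred / branches);
        printf("Instructions per branch:    %.1f\n", instr / branches);
        return 0;
    }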
+ diff --git a/collectors/likwid/groups/ICX/DATA.txt b/collectors/likwid/groups/ICX/DATA.txt new file mode 100644 index 0000000..ee15427 --- /dev/null +++ b/collectors/likwid/groups/ICX/DATA.txt @@ -0,0 +1,23 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS +PMC0 MEM_INST_RETIRED_ALL_LOADS +PMC1 MEM_INST_RETIRED_ALL_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES +- +This is a metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/ICX/DIVIDE.txt b/collectors/likwid/groups/ICX/DIVIDE.txt new file mode 100644 index 0000000..5e3be16 --- /dev/null +++ b/collectors/likwid/groups/ICX/DIVIDE.txt @@ -0,0 +1,25 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS +PMC0 ARITH_DIVIDER_COUNT +PMC1 ARITH_DIVIDER_ACTIVE + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_DIVIDER_COUNT +Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/ICX/FLOPS_AVX.txt b/collectors/likwid/groups/ICX/FLOPS_AVX.txt new file mode 100644 index 0000000..0f41891 --- /dev/null +++ b/collectors/likwid/groups/ICX/FLOPS_AVX.txt @@ -0,0 +1,26 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS +PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0+PMC2*16.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0+PMC3*8.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +- +Packed AVX (256-bit) and AVX-512 (512-bit) FLOP rates for single and double precision.
diff --git a/collectors/likwid/groups/ICX/FLOPS_DP.txt b/collectors/likwid/groups/ICX/FLOPS_DP.txt new file mode 100644 index 0000000..64e7d3d --- /dev/null +++ b/collectors/likwid/groups/ICX/FLOPS_DP.txt @@ -0,0 +1,35 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time +AVX512 DP [MFLOP/s] 1.0E-06*(PMC3*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +AVX512 DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE) +- +Scalar and packed (SSE/AVX/AVX-512) double-precision FLOP rates.
+ diff --git a/collectors/likwid/groups/ICX/FLOPS_SP.txt b/collectors/likwid/groups/ICX/FLOPS_SP.txt new file mode 100644 index 0000000..3e6780b --- /dev/null +++ b/collectors/likwid/groups/ICX/FLOPS_SP.txt @@ -0,0 +1,35 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time +AVX512 SP [MFLOP/s] 1.0E-06*(PMC3*16.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +AVX512 SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE) +- +Scalar and packed (SSE/AVX/AVX-512) single-precision FLOP rates. + diff --git a/collectors/likwid/groups/ICX/L2.txt b/collectors/likwid/groups/ICX/L2.txt new file mode 100644 index 0000000..efb6a1f --- /dev/null +++ b/collectors/likwid/groups/ICX/L2.txt @@ -0,0 +1,39 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS +PMC0 L1D_REPLACEMENT +PMC1 L2_TRANS_L1D_WB +PMC2 ICACHE_64B_IFTAG_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_64B_IFTAG_MISS)*64/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_64B_IFTAG_MISS)*64 +- +Profiling group to measure L2 cache bandwidth.
The bandwidth is computed from the +number of cache lines allocated in the L1 and the number of modified cache lines +evicted from the L1. The group also outputs the total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and traffic caused by misses in the +L1 instruction cache. + diff --git a/collectors/likwid/groups/TGL/BRANCH.txt b/collectors/likwid/groups/TGL/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/TGL/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often, on average, a branch or a mispredicted branch occurred +per retired instruction. The branch misprediction ratio directly expresses what +fraction of all branch instructions was mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/TGL/DATA.txt b/collectors/likwid/groups/TGL/DATA.txt new file mode 100644 index 0000000..4e6e938 --- /dev/null +++ b/collectors/likwid/groups/TGL/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_INST_RETIRED_ALL_LOADS +PMC1 MEM_INST_RETIRED_ALL_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES +- +This is a metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/TGL/DIVIDE.txt b/collectors/likwid/groups/TGL/DIVIDE.txt new file mode 100644 index 0000000..40b4ab6 --- /dev/null +++ b/collectors/likwid/groups/TGL/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ARITH_DIVIDER_COUNT +PMC1 ARITH_DIVIDER_ACTIVE + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_DIVIDER_COUNT +Avg.
divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/TGL/ENERGY.txt b/collectors/likwid/groups/TGL/ENERGY.txt new file mode 100644 index 0000000..fe7829f --- /dev/null +++ b/collectors/likwid/groups/TGL/ENERGY.txt @@ -0,0 +1,35 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR3 PWR_DRAM_ENERGY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +The processor implements the RAPL interface. This interface enables +monitoring of the consumed energy on the package (socket) and DRAM level. + diff --git a/collectors/likwid/groups/TGL/FLOPS_AVX.txt b/collectors/likwid/groups/TGL/FLOPS_AVX.txt new file mode 100644 index 0000000..e44a913 --- /dev/null +++ b/collectors/likwid/groups/TGL/FLOPS_AVX.txt @@ -0,0 +1,25 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0+PMC2*16.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0+PMC3*8.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +- +Packed AVX (256-bit) and AVX-512 (512-bit) FLOP rates for single and double precision.
diff --git a/collectors/likwid/groups/TGL/FLOPS_DP.txt b/collectors/likwid/groups/TGL/FLOPS_DP.txt new file mode 100644 index 0000000..177cff2 --- /dev/null +++ b/collectors/likwid/groups/TGL/FLOPS_DP.txt @@ -0,0 +1,34 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time +AVX512 DP [MFLOP/s] 1.0E-06*(PMC3*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +AVX512 DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE) +- +Scalar and packed (SSE/AVX/AVX-512) double-precision FLOP rates.
+ diff --git a/collectors/likwid/groups/TGL/FLOPS_SP.txt b/collectors/likwid/groups/TGL/FLOPS_SP.txt new file mode 100644 index 0000000..01d98c2 --- /dev/null +++ b/collectors/likwid/groups/TGL/FLOPS_SP.txt @@ -0,0 +1,34 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time +AVX512 SP [MFLOP/s] 1.0E-06*(PMC3*16.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +AVX512 SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE) +- +Scalar and packed (SSE/AVX/AVX-512) single-precision FLOP rates. + diff --git a/collectors/likwid/groups/arm64fx/BRANCH.txt b/collectors/likwid/groups/arm64fx/BRANCH.txt new file mode 100644 index 0000000..dda12fb --- /dev/null +++ b/collectors/likwid/groups/arm64fx/BRANCH.txt @@ -0,0 +1,30 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 BR_PRED +PMC3 BR_MIS_PRED + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +Branch rate PMC2/PMC0 +Branch misprediction rate PMC3/PMC0 +Branch misprediction ratio PMC3/(PMC2+PMC3) +Instructions per branch PMC0/(PMC2+PMC3) + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +Branch rate = BR_PRED/INST_RETIRED +Branch misprediction rate = BR_MIS_PRED/INST_RETIRED +Branch misprediction ratio = BR_MIS_PRED/(BR_PRED+BR_MIS_PRED) +Instructions per branch = INST_RETIRED/(BR_PRED+BR_MIS_PRED) +- +The rates state how often, on average, a branch or a mispredicted branch occurred +per retired instruction. The branch misprediction ratio directly expresses what +fraction of all branch instructions was mispredicted. +Instructions per branch is 1/branch rate.
+ diff --git a/collectors/likwid/groups/arm64fx/DATA.txt b/collectors/likwid/groups/arm64fx/DATA.txt new file mode 100644 index 0000000..40f9cb3 --- /dev/null +++ b/collectors/likwid/groups/arm64fx/DATA.txt @@ -0,0 +1,24 @@ +SHORT Load to store ratio + +EVENTSET +PMC0 INST_SPEC +PMC1 CPU_CYCLES +PMC2 LD_SPEC +PMC3 ST_SPEC + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +Load to store ratio PMC2/PMC3 +Load ratio PMC2/PMC0 +Store ratio PMC3/PMC0 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_SPEC +Load to store ratio = LD_SPEC / ST_SPEC +Load ratio = LD_SPEC / INST_SPEC +Store ratio = ST_SPEC / INST_SPEC +- +This is a metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/arm64fx/FLOPS_DP.txt b/collectors/likwid/groups/arm64fx/FLOPS_DP.txt new file mode 100644 index 0000000..5e8a565 --- /dev/null +++ b/collectors/likwid/groups/arm64fx/FLOPS_DP.txt @@ -0,0 +1,26 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC3 FP_DP_FIXED_OPS_SPEC +PMC4 FP_DP_SCALE_OPS_SPEC + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +DP (FP) [MFLOP/s] 1E-06*(PMC3)/time +DP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC4*128)/128)+PMC3)/time +DP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC4*256)/128)+PMC3)/time +DP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC4*512)/128)+PMC3)/time + +LONG +Formulas: +DP (FP) [MFLOP/s] = 1E-06*FP_DP_FIXED_OPS_SPEC/time +DP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*128)/128))/time +DP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*256)/128))/time +DP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*512)/128))/time +- +Double-precision FP rate for scalar and SVE vector operations with different widths. The events for +the SVE metrics assume that all vector elements are active. diff --git a/collectors/likwid/groups/arm64fx/FLOPS_HP.txt b/collectors/likwid/groups/arm64fx/FLOPS_HP.txt new file mode 100644 index 0000000..4f449a2 --- /dev/null +++ b/collectors/likwid/groups/arm64fx/FLOPS_HP.txt @@ -0,0 +1,26 @@ +SHORT Half-Precision MFLOP/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC3 FP_HP_FIXED_OPS_SPEC +PMC4 FP_HP_SCALE_OPS_SPEC + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +HP (FP) [MFLOP/s] 1E-06*(PMC3)/time +HP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC4*128)/128)+PMC3)/time +HP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC4*256)/128)+PMC3)/time +HP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC4*512)/128)+PMC3)/time + +LONG +Formulas: +HP (FP) [MFLOP/s] = 1E-06*FP_HP_FIXED_OPS_SPEC/time +HP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*128)/128))/time +HP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*256)/128))/time +HP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*512)/128))/time +- +Half-precision FP rate for scalar and SVE vector operations with different widths. The events for +the SVE metrics assume that all vector elements are active.
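On A64FX the *_SCALE_OPS_SPEC events count in units of 128-bit vector fractions, so the FLOP total for a vector length of VL bits is scale_ops*VL/128 plus the fixed (scalar/NEON) ops; that is why the groups print three SVE variants. A minimal C sketch with hypothetical counts (illustration only):

    #include <stdio.h>

    /* FLOPs for a given SVE vector length in bits, mirroring the
       FP_DP_FIXED_OPS_SPEC / FP_DP_SCALE_OPS_SPEC combination above */
    static double sve_flops(double fixed_ops, double scale_ops, double vl_bits) {
        return fixed_ops + scale_ops * vl_bits / 128.0;
    }

    int main(void) {
        double fixed = 1.0e9, scale = 4.0e9, seconds = 1.0; /* made-up counts */
        for (double vl = 128.0; vl <= 512.0; vl *= 2.0)
            printf("VL=%3.0f bit: %.1f MFLOP/s\n", vl,
                   1.0e-6 * sve_flops(fixed, scale, vl) / seconds);
        return 0;
    }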
diff --git a/collectors/likwid/groups/arm64fx/FLOPS_SP.txt b/collectors/likwid/groups/arm64fx/FLOPS_SP.txt new file mode 100644 index 0000000..d3248eb --- /dev/null +++ b/collectors/likwid/groups/arm64fx/FLOPS_SP.txt @@ -0,0 +1,26 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC3 FP_SP_FIXED_OPS_SPEC +PMC4 FP_SP_SCALE_OPS_SPEC + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +SP (FP) [MFLOP/s] 1E-06*(PMC3)/time +SP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC4*128)/128)+PMC3)/time +SP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC4*256)/128)+PMC3)/time +SP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC4*512)/128)+PMC3)/time + +LONG +Formulas: +SP (FP) [MFLOP/s] = 1E-06*FP_SP_FIXED_OPS_SPEC/time +SP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*128)/128))/time +SP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*256)/128))/time +SP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*512)/128))/time +- +Single-precision FP rate for scalar and SVE vector operations with different widths. The events for +the SVE metrics assume that all vector elements are active. diff --git a/collectors/likwid/groups/arm64fx/FP_PIPE.txt b/collectors/likwid/groups/arm64fx/FP_PIPE.txt new file mode 100644 index 0000000..2cde7ef --- /dev/null +++ b/collectors/likwid/groups/arm64fx/FP_PIPE.txt @@ -0,0 +1,33 @@ +SHORT Utilization of FP pipelines + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 FLA_VAL +PMC3 FLA_VAL_PRD_CNT +PMC4 FLB_VAL +PMC5 FLB_VAL_PRD_CNT + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +FP operation pipeline A busy rate [%] (PMC2/PMC1)*100.0 +FP pipeline A active element rate [%] (PMC3/(PMC2*16))*100.0 +FP operation pipeline B busy rate [%] (PMC4/PMC1)*100.0 +FP pipeline B active element rate [%] (PMC5/(PMC4*16))*100.0 + + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +FP operation pipeline A busy rate [%] = (FLA_VAL/CPU_CYCLES)*100.0 +FP pipeline A active element rate [%] = (FLA_VAL_PRD_CNT/(FLA_VAL*16))*100.0 +FP operation pipeline B busy rate [%] = (FLB_VAL/CPU_CYCLES)*100.0 +FP pipeline B active element rate [%] = (FLB_VAL_PRD_CNT/(FLB_VAL*16))*100.0 +- +FLx_VAL: This event counts valid cycles of the FLx pipeline. +FLx_VAL_PRD_CNT: This event counts the number of 1s in the predicate bits of requests in the FLx pipeline, corrected so that it becomes 16 when all bits are 1. +Each predicate mask therefore has 16 slots, so there are 16 slots per cycle in FLA and +FLB. FLA is also partly used by other instructions such as SVE stores. diff --git a/collectors/likwid/groups/arm64fx/ICACHE.txt b/collectors/likwid/groups/arm64fx/ICACHE.txt new file mode 100644 index 0000000..6a0bbea --- /dev/null +++ b/collectors/likwid/groups/arm64fx/ICACHE.txt @@ -0,0 +1,24 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L1I_CACHE +PMC3 L1I_CACHE_REFILL + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +L1I request rate PMC2/PMC0 +L1I miss rate PMC3/PMC0 +L1I miss ratio PMC3/PMC2 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +L1I request rate = L1I_CACHE / INST_RETIRED +L1I miss rate = L1I_CACHE_REFILL / INST_RETIRED +L1I miss ratio = L1I_CACHE_REFILL / L1I_CACHE +- +This group measures some L1 instruction cache metrics.
diff --git a/collectors/likwid/groups/arm64fx/L2.txt b/collectors/likwid/groups/arm64fx/L2.txt new file mode 100644 index 0000000..be47585 --- /dev/null +++ b/collectors/likwid/groups/arm64fx/L2.txt @@ -0,0 +1,40 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L1D_CACHE_REFILL +PMC3 L1D_CACHE_WB +PMC4 L1I_CACHE_REFILL + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +L1D<-L2 load bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time +L1D<-L2 load data volume [GBytes] 1.0E-09*(PMC2)*256.0 +L1D->L2 evict bandwidth [MBytes/s] 1.0E-06*PMC3*256.0/time +L1D->L2 evict data volume [GBytes] 1.0E-09*PMC3*256.0 +L1I<-L2 load bandwidth [MBytes/s] 1.0E-06*PMC4*256.0/time +L1I<-L2 load data volume [GBytes] 1.0E-09*PMC4*256.0 +L1<->L2 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3+PMC4)*256.0/time +L1<->L2 data volume [GBytes] 1.0E-09*(PMC2+PMC3+PMC4)*256.0 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +L1D<-L2 load bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_REFILL*256.0/time +L1D<-L2 load data volume [GBytes] = 1.0E-09*L1D_CACHE_REFILL*256.0 +L1D->L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_WB*256.0/time +L1D->L2 evict data volume [GBytes] = 1.0E-09*L1D_CACHE_WB*256.0 +L1I<-L2 load bandwidth [MBytes/s] = 1.0E-06*L1I_CACHE_REFILL*256.0/time +L1I<-L2 load data volume [GBytes] = 1.0E-09*L1I_CACHE_REFILL*256.0 +L1<->L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*256.0/time +L1<->L2 data volume [GBytes] = 1.0E-09*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*256.0 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the +number of cache lines loaded from the L2 into the L1 data cache and the writebacks from +the L1 data cache to the L2 cache. The group also outputs the total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and cache lines transferred into the L1 instruction +cache. diff --git a/collectors/likwid/groups/arm64fx/MEM.txt b/collectors/likwid/groups/arm64fx/MEM.txt new file mode 100644 index 0000000..b192b8b --- /dev/null +++ b/collectors/likwid/groups/arm64fx/MEM.txt @@ -0,0 +1,29 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 BUS_READ_TOTAL_MEM +PMC3 BUS_WRITE_TOTAL_MEM + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time +Memory read data volume [GBytes] 1.0E-09*(PMC2)*256.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time +Memory write data volume [GBytes] 1.0E-09*(PMC3)*256.0 +Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time +Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM)*256.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM)*256.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_MEM)*256.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_MEM)*256.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0/runtime +Memory data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0 +- +Profiling group to measure memory bandwidth. The cache line size is 256 Byte.
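All A64FX bandwidth groups scale event counts by 256 because the cache line on this architecture is 256 bytes, unlike the 64-byte lines in the x86 groups earlier. A minimal C sketch of the conversion with hypothetical counts (illustration only):

    #include <stdio.h>

    int main(void) {
        double reads   = 1.0e7; /* BUS_READ_TOTAL_MEM */
        double writes  = 4.0e6; /* BUS_WRITE_TOTAL_MEM */
        double seconds = 1.0;
        double line    = 256.0; /* A64FX cache line size in bytes */

        printf("Memory bandwidth: %.1f MByte/s\n",
               1.0e-6 * (reads + writes) * line / seconds);
        printf("Memory data volume: %.2f GByte\n",
               1.0e-9 * (reads + writes) * line);
        return 0;
    }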
diff --git a/collectors/likwid/groups/arm64fx/MEM_DP.txt b/collectors/likwid/groups/arm64fx/MEM_DP.txt new file mode 100644 index 0000000..96506ff --- /dev/null +++ b/collectors/likwid/groups/arm64fx/MEM_DP.txt @@ -0,0 +1,50 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 BUS_READ_TOTAL_MEM +PMC3 BUS_WRITE_TOTAL_MEM +PMC4 FP_DP_FIXED_OPS_SPEC +PMC5 FP_DP_SCALE_OPS_SPEC + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +DP (FP) [MFLOP/s] 1E-06*(PMC4)/time +DP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC5*128.0)/128.0)+PMC4)/time +DP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC5*256.0)/128.0)+PMC4)/time +DP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC5*512.0)/128.0)+PMC4)/time +Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time +Memory read data volume [GBytes] 1.0E-09*(PMC2)*256.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time +Memory write data volume [GBytes] 1.0E-09*(PMC3)*256.0 +Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time +Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0 +Operational intensity (FP) PMC4/((PMC2+PMC3)*256.0) +Operational intensity (FP+SVE128) (((PMC5*128.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) +Operational intensity (FP+SVE256) (((PMC5*256.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) +Operational intensity (FP+SVE512) (((PMC5*512.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) + + +LONG +Formulas: +DP (FP) [MFLOP/s] = 1E-06*FP_DP_FIXED_OPS_SPEC/time +DP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*128)/128))/time +DP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*256)/128))/time +DP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*512)/128))/time +Memory read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM)*256.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM)*256.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_MEM)*256.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_MEM)*256.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0/runtime +Memory data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0 +Operational intensity (FP) = FP_DP_FIXED_OPS_SPEC/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) +Operational intensity (FP+SVE128) = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*128)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) +Operational intensity (FP+SVE256) = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*256)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) +Operational intensity (FP+SVE512) = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*512)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) +- +Profiling group to measure memory bandwidth and double-precision FP rate for scalar and SVE vector +operations with different widths. The events for the SVE metrics assume that all vector elements +are active. The cache line size is 256 Byte.
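Operational intensity is FLOPs per byte moved to and from memory, i.e. the x-axis of a roofline plot; the runtime cancels out, so no time measurement is needed. A minimal C sketch with hypothetical counts (illustration only):

    #include <stdio.h>

    int main(void) {
        double flops  = 2.0e9;  /* e.g. FP_DP_FIXED_OPS_SPEC plus scaled SVE ops */
        double reads  = 1.0e7;  /* BUS_READ_TOTAL_MEM */
        double writes = 4.0e6;  /* BUS_WRITE_TOTAL_MEM */
        double bytes  = (reads + writes) * 256.0; /* 256-byte cache lines */

        /* FLOP per byte: time-independent, runtime cancels */
        printf("Operational intensity: %.2f FLOP/Byte\n", flops / bytes);
        return 0;
    }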
diff --git a/collectors/likwid/groups/arm64fx/MEM_HP.txt b/collectors/likwid/groups/arm64fx/MEM_HP.txt new file mode 100644 index 0000000..17d86e9 --- /dev/null +++ b/collectors/likwid/groups/arm64fx/MEM_HP.txt @@ -0,0 +1,50 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 BUS_READ_TOTAL_MEM +PMC3 BUS_WRITE_TOTAL_MEM +PMC4 FP_HP_FIXED_OPS_SPEC +PMC5 FP_HP_SCALE_OPS_SPEC + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +HP (FP) [MFLOP/s] 1E-06*(PMC4)/time +HP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC5*128.0)/128.0)+PMC4)/time +HP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC5*256.0)/128.0)+PMC4)/time +HP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC5*512.0)/128.0)+PMC4)/time +Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time +Memory read data volume [GBytes] 1.0E-09*(PMC2)*256.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time +Memory write data volume [GBytes] 1.0E-09*(PMC3)*256.0 +Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time +Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0 +Operational intensity (FP) PMC4/((PMC2+PMC3)*256.0) +Operational intensity (FP+SVE128) (((PMC5*128.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) +Operational intensity (FP+SVE256) (((PMC5*256.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) +Operational intensity (FP+SVE512) (((PMC5*512.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) + + +LONG +Formulas: +HP (FP) [MFLOP/s] = 1E-06*FP_HP_FIXED_OPS_SPEC/time +HP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*128)/128))/time +HP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*256)/128))/time +HP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*512)/128))/time +Memory read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM)*256.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM)*256.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_MEM)*256.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_MEM)*256.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0/runtime +Memory data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0 +Operational intensity (FP) = FP_HP_FIXED_OPS_SPEC/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) +Operational intensity (FP+SVE128) = (FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*128)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) +Operational intensity (FP+SVE256) = (FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*256)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) +Operational intensity (FP+SVE512) = (FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*512)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) +- +Profiling group to measure memory bandwidth and half-precision FP rate for scalar and SVE vector +operations with different widths. The events for the SVE metrics assume that all vector elements
are active. The cache line size is 256 Byte.
diff --git a/collectors/likwid/groups/arm64fx/MEM_SP.txt b/collectors/likwid/groups/arm64fx/MEM_SP.txt new file mode 100644 index 0000000..b6220b0 --- /dev/null +++ b/collectors/likwid/groups/arm64fx/MEM_SP.txt @@ -0,0 +1,50 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 BUS_READ_TOTAL_MEM +PMC3 BUS_WRITE_TOTAL_MEM +PMC4 FP_SP_FIXED_OPS_SPEC +PMC5 FP_SP_SCALE_OPS_SPEC + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +SP (FP) [MFLOP/s] 1E-06*(PMC4)/time +SP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC5*128.0)/128.0)+PMC4)/time +SP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC5*256.0)/128.0)+PMC4)/time +SP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC5*512.0)/128.0)+PMC4)/time +Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time +Memory read data volume [GBytes] 1.0E-09*(PMC2)*256.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time +Memory write data volume [GBytes] 1.0E-09*(PMC3)*256.0 +Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time +Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0 +Operational intensity (FP) PMC4/((PMC2+PMC3)*256.0) +Operational intensity (FP+SVE128) (((PMC5*128.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) +Operational intensity (FP+SVE256) (((PMC5*256.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) +Operational intensity (FP+SVE512) (((PMC5*512.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) + + +LONG +Formulas: +SP (FP) [MFLOP/s] = 1E-06*FP_SP_FIXED_OPS_SPEC/time +SP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*128)/128))/time +SP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*256)/128))/time +SP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*512)/128))/time +Memory read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM)*256.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM)*256.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_MEM)*256.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_MEM)*256.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0/runtime +Memory data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0 +Operational intensity (FP) = FP_SP_FIXED_OPS_SPEC/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) +Operational intensity (FP+SVE128) = (FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*128)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) +Operational intensity (FP+SVE256) = (FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*256)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) +Operational intensity (FP+SVE512) = (FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*512)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) +- +Profiling group to measure memory bandwidth and single-precision FP rate for scalar and SVE vector +operations with different widths. The events for the SVE metrics assume that all vector elements +are active. The cache line size is 256 Byte.
diff --git a/collectors/likwid/groups/arm64fx/PCI.txt b/collectors/likwid/groups/arm64fx/PCI.txt new file mode 100644 index 0000000..bca76a6 --- /dev/null +++ b/collectors/likwid/groups/arm64fx/PCI.txt @@ -0,0 +1,29 @@ +SHORT PCI bandwidth in MBytes/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 BUS_READ_TOTAL_PCI +PMC3 BUS_WRITE_TOTAL_PCI + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +PCI read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time +PCI read data volume [GBytes] 1.0E-09*(PMC2)*256.0 +PCI write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time +PCI write data volume [GBytes] 1.0E-09*(PMC3)*256.0 +PCI bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time +PCI data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0 + +LONG +Formulas: +PCI read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_PCI)*256.0/runtime +PCI read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_PCI)*256.0 +PCI write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_PCI)*256.0/runtime +PCI write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_PCI)*256.0 +PCI bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_PCI+BUS_WRITE_TOTAL_PCI)*256.0/runtime +PCI data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_PCI+BUS_WRITE_TOTAL_PCI)*256.0 +- +Profiling group to measure PCI bandwidth. The cache line size is 256 Byte. diff --git a/collectors/likwid/groups/arm64fx/TOFU.txt b/collectors/likwid/groups/arm64fx/TOFU.txt new file mode 100644 index 0000000..2bebe3e --- /dev/null +++ b/collectors/likwid/groups/arm64fx/TOFU.txt @@ -0,0 +1,29 @@ +SHORT TOFU bandwidth in MBytes/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 BUS_READ_TOTAL_TOFU +PMC3 BUS_WRITE_TOTAL_TOFU + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +TOFU read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time +TOFU read data volume [GBytes] 1.0E-09*(PMC2)*256.0 +TOFU write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time +TOFU write data volume [GBytes] 1.0E-09*(PMC3)*256.0 +TOFU bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time +TOFU data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0 + +LONG +Formulas: +TOFU read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_TOFU)*256.0/runtime +TOFU read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_TOFU)*256.0 +TOFU write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_TOFU)*256.0/runtime +TOFU write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_TOFU)*256.0 +TOFU bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_TOFU+BUS_WRITE_TOTAL_TOFU)*256.0/runtime +TOFU data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_TOFU+BUS_WRITE_TOTAL_TOFU)*256.0 +- +Profiling group to measure TOFU bandwidth. The cache line size is 256 Byte. 
diff --git a/collectors/likwid/groups/arm8/BRANCH.txt b/collectors/likwid/groups/arm8/BRANCH.txt new file mode 100644 index 0000000..8cd4f00 --- /dev/null +++ b/collectors/likwid/groups/arm8/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 BR_PRED +PMC3 BR_MIS_PRED +PMC4 INST_SPEC + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +Branch rate PMC2/PMC0 +Branch misprediction rate PMC3/PMC0 +Branch misprediction ratio PMC3/(PMC2+PMC3) +Instructions per branch PMC0/(PMC2+PMC3) + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +Branch rate = BR_PRED/INST_RETIRED +Branch misprediction rate = BR_MIS_PRED/INST_RETIRED +Branch misprediction ratio = BR_MIS_PRED/(BR_PRED+BR_MIS_PRED) +Instructions per branch = INST_RETIRED/(BR_PRED+BR_MIS_PRED) +- +The rates state how often, on average, a branch or a mispredicted branch occurred +per retired instruction. The branch misprediction ratio directly expresses what +fraction of all branch instructions was mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/arm8/DATA.txt b/collectors/likwid/groups/arm8/DATA.txt new file mode 100644 index 0000000..4338d90 --- /dev/null +++ b/collectors/likwid/groups/arm8/DATA.txt @@ -0,0 +1,24 @@ +SHORT Load to store ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 LD_RETIRED +PMC3 ST_RETIRED + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +Load to store ratio PMC2/PMC3 +Load ratio PMC2/PMC0 +Store ratio PMC3/PMC0 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +Load to store ratio = LD_RETIRED / ST_RETIRED +Load ratio = LD_RETIRED / INST_RETIRED +Store ratio = ST_RETIRED / INST_RETIRED +- +This is a metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/arm8/ICACHE.txt b/collectors/likwid/groups/arm8/ICACHE.txt new file mode 100644 index 0000000..6a0bbea --- /dev/null +++ b/collectors/likwid/groups/arm8/ICACHE.txt @@ -0,0 +1,24 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L1I_CACHE +PMC3 L1I_CACHE_REFILL + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +L1I request rate PMC2/PMC0 +L1I miss rate PMC3/PMC0 +L1I miss ratio PMC3/PMC2 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +L1I request rate = L1I_CACHE / INST_RETIRED +L1I miss rate = L1I_CACHE_REFILL / INST_RETIRED +L1I miss ratio = L1I_CACHE_REFILL / L1I_CACHE +- +This group measures some L1 instruction cache metrics.
diff --git a/collectors/likwid/groups/arm8/L2.txt b/collectors/likwid/groups/arm8/L2.txt new file mode 100644 index 0000000..9f0c2e4 --- /dev/null +++ b/collectors/likwid/groups/arm8/L2.txt @@ -0,0 +1,40 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L1D_CACHE_REFILL +PMC3 L1D_CACHE_WB +PMC4 L1I_CACHE_REFILL + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC2*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L2I load bandwidth [MBytes/s] 1.0E-06*PMC4*64.0/time +L2I load data volume [GBytes] 1.0E-09*PMC4*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3+PMC4)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC2+PMC3+PMC4)*64.0 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_REFILL*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_CACHE_REFILL*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_WB*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_CACHE_WB*64.0 +L2I load bandwidth [MBytes/s] = 1.0E-06*L1I_CACHE_REFILL*64.0/time +L2I load data volume [GBytes] = 1.0E-09*L1I_CACHE_REFILL*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0/time +L2 data volume [GBytes] = 1.0E-09*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the +number of cache lines loaded from the L2 to the L1 data cache and the writebacks from +the L1 data cache to the L2 cache. The group also outputs the total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write-allocate +load on a store miss in L1 and cache lines transferred into the instruction +cache. diff --git a/collectors/likwid/groups/arm8/MEM.txt b/collectors/likwid/groups/arm8/MEM.txt new file mode 100644 index 0000000..d383916 --- /dev/null +++ b/collectors/likwid/groups/arm8/MEM.txt @@ -0,0 +1,30 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L2D_CACHE_REFILL +PMC3 L2D_CACHE_WB + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(PMC2)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(PMC3)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time +Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL)*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL)*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_WB)*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(L2D_CACHE_WB)*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL+L2D_CACHE_WB)*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL+L2D_CACHE_WB)*64.0 +- +Profiling group to measure memory bandwidth as initiated by the L2 cache.
+ diff --git a/collectors/likwid/groups/arm8_n1/BRANCH.txt b/collectors/likwid/groups/arm8_n1/BRANCH.txt new file mode 100644 index 0000000..8cd4f00 --- /dev/null +++ b/collectors/likwid/groups/arm8_n1/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 BR_PRED +PMC3 BR_MIS_PRED +PMC4 INST_SPEC + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +Branch rate PMC2/PMC0 +Branch misprediction rate PMC3/PMC0 +Branch misprediction ratio PMC3/(PMC2+PMC3) +Instructions per branch PMC0/(PMC2+PMC3) + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +Branch rate = BR_PRED/INST_RETIRED +Branch misprediction rate = BR_MIS_PRED/INST_RETIRED +Branch misprediction ratio = BR_MIS_PRED/(BR_PRED+BR_MIS_PRED) +Instructions per branch = INST_RETIRED/(BR_PRED+BR_MIS_PRED) +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio expresses what +fraction of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/arm8_n1/CLOCK.txt b/collectors/likwid/groups/arm8_n1/CLOCK.txt new file mode 100644 index 0000000..ad7303a --- /dev/null +++ b/collectors/likwid/groups/arm8_n1/CLOCK.txt @@ -0,0 +1,16 @@ +SHORT Cycles and instructions + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +- +This is a metric to determine cycles per instruction. + diff --git a/collectors/likwid/groups/arm8_n1/DATA.txt b/collectors/likwid/groups/arm8_n1/DATA.txt new file mode 100644 index 0000000..d2221a8 --- /dev/null +++ b/collectors/likwid/groups/arm8_n1/DATA.txt @@ -0,0 +1,24 @@ +SHORT Load to store ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 LD_SPEC +PMC3 ST_SPEC + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +Load to store ratio PMC2/PMC3 +Load ratio PMC2/PMC0 +Store ratio PMC3/PMC0 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +Load to store ratio = LD_SPEC / ST_SPEC +Load ratio = LD_SPEC / INST_RETIRED +Store ratio = ST_SPEC / INST_RETIRED +- +This is a metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/arm8_n1/ICACHE.txt b/collectors/likwid/groups/arm8_n1/ICACHE.txt new file mode 100644 index 0000000..6a0bbea --- /dev/null +++ b/collectors/likwid/groups/arm8_n1/ICACHE.txt @@ -0,0 +1,24 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L1I_CACHE +PMC3 L1I_CACHE_REFILL + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +L1I request rate PMC2/PMC0 +L1I miss rate PMC3/PMC0 +L1I miss ratio PMC3/PMC2 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +L1I request rate = L1I_CACHE / INST_RETIRED +L1I miss rate = L1I_CACHE_REFILL / INST_RETIRED +L1I miss ratio = L1I_CACHE_REFILL / L1I_CACHE +- +This group measures some L1 instruction cache metrics.
diff --git a/collectors/likwid/groups/arm8_n1/L2.txt b/collectors/likwid/groups/arm8_n1/L2.txt new file mode 100644 index 0000000..9f0c2e4 --- /dev/null +++ b/collectors/likwid/groups/arm8_n1/L2.txt @@ -0,0 +1,40 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L1D_CACHE_REFILL +PMC3 L1D_CACHE_WB +PMC4 L1I_CACHE_REFILL + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC2*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L2I load bandwidth [MBytes/s] 1.0E-06*PMC4*64.0/time +L2I load data volume [GBytes] 1.0E-09*PMC4*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3+PMC4)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC2+PMC3+PMC4)*64.0 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_REFILL*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_CACHE_REFILL*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_WB*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_CACHE_WB*64.0 +L2I load bandwidth [MBytes/s] = 1.0E-06*L1I_CACHE_REFILL*64.0/time +L2I load data volume [GBytes] = 1.0E-09*L1I_CACHE_REFILL*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0/time +L2 data volume [GBytes] = 1.0E-09*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the +number of cache lines loaded from the L2 to the L1 data cache and the writebacks from
the L1 data cache to the L2 cache. The group also outputs the total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write-allocate +load on a store miss in L1 and cache lines transferred into the instruction +cache. diff --git a/collectors/likwid/groups/arm8_n1/L3.txt b/collectors/likwid/groups/arm8_n1/L3.txt new file mode 100644 index 0000000..3c8a73e --- /dev/null +++ b/collectors/likwid/groups/arm8_n1/L3.txt @@ -0,0 +1,30 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L2D_CACHE_REFILL +PMC3 L2D_CACHE_WB + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +L3 read bandwidth [MBytes/s] 1.0E-06*(PMC2)*64.0/time +L3 read data volume [GBytes] 1.0E-09*(PMC2)*64.0 +L3 write bandwidth [MBytes/s] 1.0E-06*(PMC3)*64.0/time +L3 write data volume [GBytes] 1.0E-09*(PMC3)*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 + +LONG +Formulas: +L3 read bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL)*64.0/runtime +L3 read data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL)*64.0 +L3 write bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_WB)*64.0/runtime +L3 write data volume [GBytes] = 1.0E-09*(L2D_CACHE_WB)*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL+L2D_CACHE_WB)*64.0/runtime +L3 data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL+L2D_CACHE_WB)*64.0 +- +Profiling group to measure traffic between L2 and L3 cache.
+ diff --git a/collectors/likwid/groups/arm8_n1/MEM.txt b/collectors/likwid/groups/arm8_n1/MEM.txt new file mode 100644 index 0000000..8c334bb --- /dev/null +++ b/collectors/likwid/groups/arm8_n1/MEM.txt @@ -0,0 +1,29 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 MEM_ACCESS_RD +PMC3 MEM_ACCESS_WR + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(PMC2)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(PMC3)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time +Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(MEM_ACCESS_RD)*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(MEM_ACCESS_RD)*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(MEM_ACCESS_WR)*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(MEM_ACCESS_WR)*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(MEM_ACCESS_RD+MEM_ACCESS_WR)*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(MEM_ACCESS_RD+MEM_ACCESS_WR)*64.0 +- +Profiling group to measure memory bandwidth. + diff --git a/collectors/likwid/groups/arm8_n1/TLB.txt b/collectors/likwid/groups/arm8_n1/TLB.txt new file mode 100644 index 0000000..4e588b1 --- /dev/null +++ b/collectors/likwid/groups/arm8_n1/TLB.txt @@ -0,0 +1,30 @@ +SHORT L1/L2 TLB information + +EVENTSET +PMC0 L1D_TLB +PMC1 L1I_TLB +PMC2 L2D_TLB +PMC3 L1D_TLB_REFILL +PMC4 L1I_TLB_REFILL +PMC5 L2D_TLB_REFILL + +METRICS +Runtime (RDTSC) [s] time +L1 DTLB accesses PMC0 +L1 ITLB accesses PMC1 +L2 DTLB accesses PMC2 +L1 DTLB refills PMC3 +L1 ITLB refills PMC4 +L2 DTLB refills PMC5 +L1 DTLB refill ratio PMC3/PMC0 +L1 ITLB refill ratio PMC4/PMC1 +L2 DTLB refill ratio PMC5/PMC2 + +LONG +Formulas: +L1 DTLB refill ratio = L1D_TLB_REFILL / L1D_TLB +L1 ITLB refill ratio = L1I_TLB_REFILL / L1I_TLB +L2 DTLB refill ratio = L2D_TLB_REFILL / L2D_TLB +- +This group gives information about the TLB usage for all TLBs: +L1 data, L1 instruction and L2 data. diff --git a/collectors/likwid/groups/arm8_tx2/BRANCH.txt b/collectors/likwid/groups/arm8_tx2/BRANCH.txt new file mode 100644 index 0000000..db0fa40 --- /dev/null +++ b/collectors/likwid/groups/arm8_tx2/BRANCH.txt @@ -0,0 +1,32 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 BR_PRED +PMC3 BR_MIS_PRED +PMC4 INST_SPEC + + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +Branch rate PMC2/PMC0 +Branch misprediction rate PMC3/PMC0 +Branch misprediction ratio PMC3/(PMC2+PMC3) +Instructions per branch PMC0/(PMC2+PMC3) + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +Branch rate = BR_PRED/INST_RETIRED +Branch misprediction rate = BR_MIS_PRED/INST_RETIRED +Branch misprediction ratio = BR_MIS_PRED/(BR_PRED+BR_MIS_PRED) +Instructions per branch = INST_RETIRED/(BR_PRED+BR_MIS_PRED) +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio expresses what +fraction of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate.
+ diff --git a/collectors/likwid/groups/arm8_tx2/DATA.txt b/collectors/likwid/groups/arm8_tx2/DATA.txt new file mode 100644 index 0000000..09681c2 --- /dev/null +++ b/collectors/likwid/groups/arm8_tx2/DATA.txt @@ -0,0 +1,25 @@ +SHORT Load to store ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 LD_RETIRED +PMC3 ST_RETIRED + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +Load to store ratio PMC2/PMC3 +Load ratio PMC2/PMC0 +Store ratio PMC3/PMC0 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +Load to store ratio = LD_RETIRED / ST_RETIRED +Load ratio = LD_RETIRED / INST_RETIRED +Store ratio = ST_RETIRED / INST_RETIRED +- +This is a metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/arm8_tx2/FLOPS_DP.txt b/collectors/likwid/groups/arm8_tx2/FLOPS_DP.txt new file mode 100644 index 0000000..5b477de --- /dev/null +++ b/collectors/likwid/groups/arm8_tx2/FLOPS_DP.txt @@ -0,0 +1,28 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 VFP_SPEC +PMC3 ASE_SPEC + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +DP [MFLOP/s] 1.0E-06*(PMC3*2.0+PMC2)/time +NEON DP [MFLOP/s] 1.0E-06*(PMC3*2.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC2/time +Vectorization ratio 100*(PMC3)/(PMC2+PMC3) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(ASE_SPEC*2+VFP_SPEC)/runtime +NEON DP [MFLOP/s] = 1.0E-06*(ASE_SPEC*2)/runtime +Packed [MUOPS/s] = 1.0E-06*(ASE_SPEC)/runtime +Scalar [MUOPS/s] = 1.0E-06*VFP_SPEC/runtime +Vectorization ratio = 100*(ASE_SPEC)/(ASE_SPEC+VFP_SPEC) +- +NEON scalar and packed double precision FLOP rates. + diff --git a/collectors/likwid/groups/arm8_tx2/FLOPS_SP.txt b/collectors/likwid/groups/arm8_tx2/FLOPS_SP.txt new file mode 100644 index 0000000..9857308 --- /dev/null +++ b/collectors/likwid/groups/arm8_tx2/FLOPS_SP.txt @@ -0,0 +1,28 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 VFP_SPEC +PMC3 ASE_SPEC + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +SP [MFLOP/s] 1.0E-06*(PMC3*4.0+PMC2)/time +NEON SP [MFLOP/s] 1.0E-06*(PMC3*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC2/time +Vectorization ratio 100*(PMC3)/(PMC2+PMC3) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(ASE_SPEC*4+VFP_SPEC)/runtime +NEON SP [MFLOP/s] = 1.0E-06*(ASE_SPEC*4)/runtime +Packed [MUOPS/s] = 1.0E-06*(ASE_SPEC)/runtime +Scalar [MUOPS/s] = 1.0E-06*VFP_SPEC/runtime +Vectorization ratio = 100*(ASE_SPEC)/(ASE_SPEC+VFP_SPEC) +- +NEON scalar and packed single precision FLOP rates. + diff --git a/collectors/likwid/groups/arm8_tx2/ICACHE.txt b/collectors/likwid/groups/arm8_tx2/ICACHE.txt new file mode 100644 index 0000000..fbaf3be --- /dev/null +++ b/collectors/likwid/groups/arm8_tx2/ICACHE.txt @@ -0,0 +1,23 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L1I_CACHE +PMC3 L1I_CACHE_REFILL + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +L1I request rate PMC2/PMC0 +L1I miss rate PMC3/PMC0 +L1I miss ratio PMC3/PMC2 + +LONG +Formulas: +L1I request rate = L1I_CACHE / INST_RETIRED +L1I miss rate = L1I_CACHE_REFILL / INST_RETIRED +L1I miss ratio = L1I_CACHE_REFILL / L1I_CACHE +- +This group measures some L1 instruction cache metrics.
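The scaling factors in the two FLOPS groups above follow from the 128-bit NEON width: one vector instruction carries two double precision or four single precision lanes, while VFP_SPEC counts scalar operations. Note that VFP_SPEC/ASE_SPEC do not distinguish precision, so each group effectively assumes all FP ops are of its precision. A small C sketch with hypothetical counts:

#include <stdio.h>

int main(void) {
    /* Hypothetical speculatively executed op counts over 1.0 s:
     * VFP_SPEC = scalar FP ops, ASE_SPEC = 128-bit NEON vector ops. */
    double vfp = 2.0e9, ase = 1.0e9, time = 1.0;

    /* 128-bit NEON: 2 DP lanes or 4 SP lanes per vector instruction. */
    double dp_mflops  = 1.0E-06 * (ase * 2.0 + vfp) / time;
    double sp_mflops  = 1.0E-06 * (ase * 4.0 + vfp) / time;
    double vec_ratio  = 100.0 * ase / (ase + vfp);

    printf("DP %.0f MFLOP/s, SP %.0f MFLOP/s, vectorized %.1f %%\n",
           dp_mflops, sp_mflops, vec_ratio);
    return 0;
}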
diff --git a/collectors/likwid/groups/arm8_tx2/L2.txt b/collectors/likwid/groups/arm8_tx2/L2.txt new file mode 100644 index 0000000..53bec4c --- /dev/null +++ b/collectors/likwid/groups/arm8_tx2/L2.txt @@ -0,0 +1,41 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L1D_CACHE_REFILL +PMC3 L1D_CACHE_WB +PMC4 L1I_CACHE_REFILL + + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC2*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L2I load bandwidth [MBytes/s] 1.0E-06*PMC4*64.0/time +L2I load data volume [GBytes] 1.0E-09*PMC4*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3+PMC4)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC2+PMC3+PMC4)*64.0 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_REFILL*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_CACHE_REFILL*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_WB*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_CACHE_WB*64.0 +L2I load bandwidth [MBytes/s] = 1.0E-06*L1I_CACHE_REFILL*64.0/time +L2I load data volume [GBytes] = 1.0E-09*L1I_CACHE_REFILL*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0/time +L2 data volume [GBytes] = 1.0E-09*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the +number of cache lines loaded from the L2 to the L1 data cache and the writebacks from +the L1 data cache to the L2 cache. The group also outputs the total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write-allocate +load on a store miss in L1 and cache lines transferred into the instruction +cache. diff --git a/collectors/likwid/groups/arm8_tx2/L2CACHE.txt b/collectors/likwid/groups/arm8_tx2/L2CACHE.txt new file mode 100644 index 0000000..4696e28 --- /dev/null +++ b/collectors/likwid/groups/arm8_tx2/L2CACHE.txt @@ -0,0 +1,32 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L2D_CACHE +PMC3 L2D_CACHE_REFILL + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +L2 request rate PMC2/PMC0 +L2 miss rate PMC3/PMC0 +L2 miss ratio PMC3/PMC2 + +LONG +Formulas: +L2 request rate = L2D_CACHE/INST_RETIRED +L2 miss rate = L2D_CACHE_REFILL/INST_RETIRED +L2 miss ratio = L2D_CACHE_REFILL/L2D_CACHE +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is, +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally, L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+ + diff --git a/collectors/likwid/groups/arm8_tx2/L3.txt b/collectors/likwid/groups/arm8_tx2/L3.txt new file mode 100644 index 0000000..4c99a05 --- /dev/null +++ b/collectors/likwid/groups/arm8_tx2/L3.txt @@ -0,0 +1,38 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L2D_CACHE_REFILL +PMC3 L2D_CACHE_WB +PMC4 L2D_CACHE_ALLOCATE + + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +L3 load bandwidth [MBytes/s] 1.0E-06*(PMC2-PMC4)*64.0/time +L3 load data volume [GBytes] 1.0E-09*(PMC2-PMC4)*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3-PMC4)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3-PMC4)*64.0 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +L3 load bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL-L2D_CACHE_ALLOCATE)*64.0/time +L3 load data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL-L2D_CACHE_ALLOCATE)*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2D_CACHE_WB*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2D_CACHE_WB*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL+L2D_CACHE_WB-L2D_CACHE_ALLOCATE)*64.0/time +L3 data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL+L2D_CACHE_WB-L2D_CACHE_ALLOCATE)*64.0 +- +Profiling group to measure L2 <-> L3 cache bandwidth. The bandwidth is computed from the +number of cache lines loaded from the L3 to the L2 data cache and the writebacks from +the L2 data cache to the L3 cache. The group also outputs the total data volume transferred between +L3 and L2. For streaming stores, the cache lines are allocated directly in L2, so in this +case there is no traffic between L3 and L2. The L2D_CACHE_REFILL event nevertheless counts +these allocated cache lines, which is why the value of L2D_CACHE_REFILL is reduced +by L2D_CACHE_ALLOCATE. diff --git a/collectors/likwid/groups/arm8_tx2/MEM.txt b/collectors/likwid/groups/arm8_tx2/MEM.txt new file mode 100644 index 0000000..06bc697 --- /dev/null +++ b/collectors/likwid/groups/arm8_tx2/MEM.txt @@ -0,0 +1,32 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +MBOX0C0 MEMORY_READS +MBOX0C1 MEMORY_WRITES +MBOX1C0 MEMORY_READS +MBOX1C1 MEMORY_WRITES + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MEMORY_READS))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(MEMORY_READS))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MEMORY_WRITES))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(MEMORY_WRITES))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MEMORY_READS)+SUM(MEMORY_WRITES))*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(SUM(MEMORY_READS)+SUM(MEMORY_WRITES))*64.0 +- +Profiling group to measure memory bandwidth. It uses the performance monitoring +hardware of the memory controllers.
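The subtraction of L2D_CACHE_ALLOCATE in the L3 group above can be made concrete with a short C sketch (hypothetical counts): cache lines allocated in L2 by streaming stores are counted by L2D_CACHE_REFILL although no data moved from L3 to L2 for them, so they must not contribute to the load bandwidth.

#include <stdio.h>

int main(void) {
    /* Hypothetical counts over a 1.0 s measurement. */
    double refill = 5.0e8; /* L2D_CACHE_REFILL: lines filled into L2      */
    double alloc  = 1.0e8; /* L2D_CACHE_ALLOCATE: allocated, no L3 refill */
    double wb     = 2.0e8; /* L2D_CACHE_WB: lines written back to L3      */
    double time = 1.0, line = 64.0;

    /* Only refills not caused by a streaming-store allocation actually
     * moved data from L3 to L2. */
    double load_bw  = 1.0E-06 * (refill - alloc) * line / time;
    double total_bw = 1.0E-06 * (refill + wb - alloc) * line / time;

    printf("L3 load %.0f MBytes/s, L3 total %.0f MBytes/s\n",
           load_bw, total_bw);
    return 0;
}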
diff --git a/collectors/likwid/groups/arm8_tx2/SPEC.txt b/collectors/likwid/groups/arm8_tx2/SPEC.txt new file mode 100644 index 0000000..7561d3a --- /dev/null +++ b/collectors/likwid/groups/arm8_tx2/SPEC.txt @@ -0,0 +1,44 @@ +SHORT Information about speculative execution + +EVENTSET +PMC0 INST_SPEC +PMC1 LD_SPEC +PMC2 ST_SPEC +PMC3 DP_SPEC +PMC4 VFP_SPEC +PMC5 ASE_SPEC + + +METRICS +Runtime (RDTSC) [s] time +Operations spec. executed PMC0 +Load ops spec. executed PMC1 +Store ops spec. executed PMC2 +Integer data ops spec. executed PMC3 +Scalar FP ops spec. executed PMC4 +Vector FP ops spec. executed PMC5 +Other ops spec. executed (PMC0-PMC1-PMC2-PMC3-PMC4-PMC5) +Load ops spec. ratio PMC1/PMC0 +Store ops spec. ratio PMC2/PMC0 +Integer data ops spec. ratio PMC3/PMC0 +Scalar FP ops spec. ratio PMC4/PMC0 +Vector FP ops spec. ratio PMC5/PMC0 +Other ops spec. ratio (PMC0-PMC1-PMC2-PMC3-PMC4-PMC5)/PMC0 + + + + +LONG +Formulas: +Load ops spec. ratio = LD_SPEC / INST_SPEC +Store ops spec. ratio = ST_SPEC / INST_SPEC +Integer data ops spec. ratio = DP_SPEC / INST_SPEC +Scalar FP ops spec. ratio = VFP_SPEC / INST_SPEC +Vector FP ops spec. ratio = ASE_SPEC / INST_SPEC +Other ops spec. ratio = (INST_SPEC-LD_SPEC-ST_SPEC-DP_SPEC-VFP_SPEC-ASE_SPEC) / INST_SPEC +- +This group gives information about the speculative execution of micro-ops. +It is currently unclear why the Other ops spec. executed count and ratio are negative +in some cases. Although the documentation contains an OP_RETIRED event, there is no +equivalent OP_SPEC, which could otherwise serve as a better reference in this group than +INST_SPEC. diff --git a/collectors/likwid/groups/arm8_tx2/TLB_DATA.txt b/collectors/likwid/groups/arm8_tx2/TLB_DATA.txt new file mode 100644 index 0000000..054b0ec --- /dev/null +++ b/collectors/likwid/groups/arm8_tx2/TLB_DATA.txt @@ -0,0 +1,27 @@ +SHORT L1 data TLB miss rate/ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L1D_TLB_REFILL_RD +PMC3 L1D_TLB_REFILL_WR + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +L1 DTLB load misses PMC2 +L1 DTLB load miss rate PMC2/PMC0 +L1 DTLB store misses PMC3 +L1 DTLB store miss rate PMC3/PMC0 + +LONG +Formulas: +L1 DTLB load misses = L1D_TLB_REFILL_RD +L1 DTLB load miss rate = L1D_TLB_REFILL_RD / INST_RETIRED +L1 DTLB store misses = L1D_TLB_REFILL_WR +L1 DTLB store miss rate = L1D_TLB_REFILL_WR / INST_RETIRED +- +The DTLB load and store miss rates give a measure of how often a TLB miss occurred +per instruction. + diff --git a/collectors/likwid/groups/arm8_tx2/TLB_INSTR.txt b/collectors/likwid/groups/arm8_tx2/TLB_INSTR.txt new file mode 100644 index 0000000..c1111c8 --- /dev/null +++ b/collectors/likwid/groups/arm8_tx2/TLB_INSTR.txt @@ -0,0 +1,23 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L1I_TLB_REFILL + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +L1 ITLB misses PMC2 +L1 ITLB miss rate PMC2/PMC0 + + +LONG +Formulas: +L1 ITLB misses = L1I_TLB_REFILL +L1 ITLB miss rate = L1I_TLB_REFILL / INST_RETIRED +- +The ITLB miss rate gives a measure of how often a TLB miss occurred +per instruction.
+ diff --git a/collectors/likwid/groups/atom/BRANCH.txt b/collectors/likwid/groups/atom/BRANCH.txt new file mode 100644 index 0000000..7b2bb20 --- /dev/null +++ b/collectors/likwid/groups/atom/BRANCH.txt @@ -0,0 +1,29 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +PMC0 BR_INST_RETIRED_ANY +PMC1 BR_INST_RETIRED_MISPRED + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ANY/INSTR_RETIRED_ANY +Branch misprediction rate = BR_INST_RETIRED_MISPRED/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_INST_RETIRED_MISPRED/BR_INST_RETIRED_ANY +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ANY +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio expresses what +fraction of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/atom/DATA.txt b/collectors/likwid/groups/atom/DATA.txt new file mode 100644 index 0000000..b2d007f --- /dev/null +++ b/collectors/likwid/groups/atom/DATA.txt @@ -0,0 +1,20 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +PMC0 L1D_CACHE_LD +PMC1 L1D_CACHE_ST + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = L1D_CACHE_LD/L1D_CACHE_ST +- +This is a simple metric to determine your load to store ratio.
+ diff --git a/collectors/likwid/groups/atom/FLOPS_DP.txt b/collectors/likwid/groups/atom/FLOPS_DP.txt new file mode 100644 index 0000000..53b2d02 --- /dev/null +++ b/collectors/likwid/groups/atom/FLOPS_DP.txt @@ -0,0 +1,25 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +PMC0 SIMD_COMP_INST_RETIRED_PACKED_DOUBLE +PMC1 SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time + + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_DOUBLE*2.0+SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE)/runtime +Packed [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_PACKED_DOUBLE/runtime +Scalar [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE/runtime +-- +Double precision MFLOP/s, both scalar and packed SSE. + diff --git a/collectors/likwid/groups/atom/FLOPS_SP.txt b/collectors/likwid/groups/atom/FLOPS_SP.txt new file mode 100644 index 0000000..0046d5b --- /dev/null +++ b/collectors/likwid/groups/atom/FLOPS_SP.txt @@ -0,0 +1,24 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +PMC0 SIMD_COMP_INST_RETIRED_PACKED_SINGLE +PMC1 SIMD_COMP_INST_RETIRED_SCALAR_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*(PMC0)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_SINGLE*4.0+SIMD_COMP_INST_RETIRED_SCALAR_SINGLE)/runtime +Packed [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_PACKED_SINGLE/runtime +Scalar [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_SCALAR_SINGLE/runtime +-- +Single precision MFLOP/s, both scalar and packed SSE. + diff --git a/collectors/likwid/groups/atom/FLOPS_X87.txt b/collectors/likwid/groups/atom/FLOPS_X87.txt new file mode 100644 index 0000000..58c5d42 --- /dev/null +++ b/collectors/likwid/groups/atom/FLOPS_X87.txt @@ -0,0 +1,19 @@ +SHORT X87 MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +PMC0 X87_COMP_OPS_EXE_ANY_AR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +X87 [MFLOP/s] 1.0E-06*PMC0/time + +LONG +Formulas: +X87 [MFLOP/s] = 1.0E-06*X87_COMP_OPS_EXE_ANY_AR/runtime +-- +The MFLOP/s achieved with X87 instructions. + diff --git a/collectors/likwid/groups/atom/MEM.txt b/collectors/likwid/groups/atom/MEM.txt new file mode 100644 index 0000000..355b7fd --- /dev/null +++ b/collectors/likwid/groups/atom/MEM.txt @@ -0,0 +1,21 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +PMC0 BUS_TRANS_MEM_THIS_CORE_THIS_A + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +Memory bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +Memory data volume [GBytes] 1.0E-09*PMC0*64.0 + +LONG +Formulas: +Memory bandwidth [MBytes/s] = 1.0E-06*BUS_TRANS_MEM_THIS_CORE_THIS_A*64.0/time +Memory data volume [GBytes] = 1.0E-09*BUS_TRANS_MEM_THIS_CORE_THIS_A*64.0 +- +Profiling group to measure memory bandwidth drawn by this core.
+ diff --git a/collectors/likwid/groups/atom/TLB.txt b/collectors/likwid/groups/atom/TLB.txt new file mode 100644 index 0000000..5d0aa1b --- /dev/null +++ b/collectors/likwid/groups/atom/TLB.txt @@ -0,0 +1,21 @@ +SHORT TLB miss rate + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +PMC0 DATA_TLB_MISSES_DTLB_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +DTLB misses PMC0 +DTLB miss rate PMC0/FIXC0 + +LONG +Formulas: +DTLB misses = DATA_TLB_MISSES_DTLB_MISS +DTLB miss rate = DATA_TLB_MISSES_DTLB_MISS/INSTR_RETIRED_ANY +-- +The DTLB miss rate gives a measure of how often a TLB miss occurred per instruction. + diff --git a/collectors/likwid/groups/broadwell/BRANCH.txt b/collectors/likwid/groups/broadwell/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/broadwell/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio expresses what +fraction of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/broadwell/CLOCK.txt b/collectors/likwid/groups/broadwell/CLOCK.txt new file mode 100644 index 0000000..b81bee6 --- /dev/null +++ b/collectors/likwid/groups/broadwell/CLOCK.txt @@ -0,0 +1,26 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +UBOXFIX UNCORE_CLOCK + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*UBOXFIX/time +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time +- +Broadwell implements the new RAPL interface. This interface makes it possible to +monitor the energy consumed at the package (socket) level.
+ diff --git a/collectors/likwid/groups/broadwell/CYCLE_ACTIVITY.txt b/collectors/likwid/groups/broadwell/CYCLE_ACTIVITY.txt new file mode 100644 index 0000000..c432a44 --- /dev/null +++ b/collectors/likwid/groups/broadwell/CYCLE_ACTIVITY.txt @@ -0,0 +1,38 @@ +SHORT Cycle Activities + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING +PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING +PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING +PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Cycles without execution [%] (PMC3/FIXC1)*100 +Cycles without execution due to L1D [%] (PMC2/FIXC1)*100 +Cycles without execution due to L2 [%] (PMC0/FIXC1)*100 +Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100 +-- +This performance group measures the cycles while waiting for data from the cache +and memory hierarchy. +CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts number of cycles nothing is executed on +any execution port. +CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while memory subsystem has an +outstanding load.
diff --git a/collectors/likwid/groups/broadwell/CYCLE_STALLS.txt b/collectors/likwid/groups/broadwell/CYCLE_STALLS.txt new file mode 100644 index 0000000..795aeb9 --- /dev/null +++ b/collectors/likwid/groups/broadwell/CYCLE_STALLS.txt @@ -0,0 +1,45 @@ +SHORT Cycle Activities (Stalls) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING +PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING +PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING +PMC3 CYCLE_ACTIVITY_STALLS_TOTAL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Total execution stalls PMC3 +Stalls caused by L1D misses [%] (PMC2/PMC3)*100 +Stalls caused by L2 misses [%] (PMC0/PMC3)*100 +Stalls caused by memory loads [%] (PMC1/PMC3)*100 +Execution stall rate [%] (PMC3/FIXC1)*100 +Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100 +Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100 +Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL +Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100 +-- +This performance group measures the stalls caused by data traffic in the cache +hierarchy. +CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls. +CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while memory subsystem has +an outstanding load. diff --git a/collectors/likwid/groups/broadwell/DATA.txt b/collectors/likwid/groups/broadwell/DATA.txt new file mode 100644 index 0000000..6955eb7 --- /dev/null +++ b/collectors/likwid/groups/broadwell/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_UOPS_RETIRED_LOADS_ALL +PMC1 MEM_UOPS_RETIRED_STORES_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_UOPS_RETIRED_LOADS_ALL/MEM_UOPS_RETIRED_STORES_ALL +- +This is a metric to determine your load to store ratio. 
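As a worked example of the CYCLE_STALLS metrics above, here is a C sketch with hypothetical counter values; each stall event is reported both as a share of all stalls and as a share of all core cycles:

#include <stdio.h>

int main(void) {
    /* Hypothetical counts: core cycles and CYCLE_ACTIVITY_STALLS_* events. */
    double cycles = 4.0e9, stalls_total = 1.2e9;
    double stalls_l2 = 3.0e8, stalls_ldm = 6.0e8;

    printf("execution stall rate   %.1f %%\n", stalls_total / cycles * 100.0);
    printf("stalls caused by L2    %.1f %%\n", stalls_l2 / stalls_total * 100.0);
    printf("stalls caused by loads %.1f %%\n", stalls_ldm / stalls_total * 100.0);
    return 0;
}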
+ diff --git a/collectors/likwid/groups/broadwell/DIVIDE.txt b/collectors/likwid/groups/broadwell/DIVIDE.txt new file mode 100644 index 0000000..c7c5fb2 --- /dev/null +++ b/collectors/likwid/groups/broadwell/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0:EDGEDETECT ARITH_FPU_DIV_ACTIVE +PMC1 ARITH_FPU_DIV_ACTIVE + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0:EDGEDETECT +Avg. divide unit usage duration PMC1/PMC0:EDGEDETECT + +LONG +Formulas: +Number of divide ops = ARITH_FPU_DIV_ACTIVE:EDGEDETECT +Avg. divide unit usage duration = ARITH_FPU_DIV_ACTIVE/ARITH_FPU_DIV_ACTIVE:EDGEDETECT +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/broadwell/ENERGY.txt b/collectors/likwid/groups/broadwell/ENERGY.txt new file mode 100644 index 0000000..09eaeb1 --- /dev/null +++ b/collectors/likwid/groups/broadwell/ENERGY.txt @@ -0,0 +1,39 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR2 PWR_PP1_ENERGY +PWR3 PWR_DRAM_ENERGY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy PP1 [J] PWR2 +Power PP1 [W] PWR2/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power PP1 = PWR_PP1_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +Broadwell implements the new RAPL interface. This interface makes it possible to +monitor the energy consumed at the package (socket) and DRAM level. + diff --git a/collectors/likwid/groups/broadwell/FALSE_SHARE.txt b/collectors/likwid/groups/broadwell/FALSE_SHARE.txt new file mode 100644 index 0000000..a297654 --- /dev/null +++ b/collectors/likwid/groups/broadwell/FALSE_SHARE.txt @@ -0,0 +1,25 @@ +SHORT False sharing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM +PMC2 MEM_UOPS_RETIRED_LOADS_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local LLC false sharing [MByte] 1.E-06*PMC0*64 +Local LLC false sharing rate PMC0/PMC2 + +LONG +Formulas: +Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM*64 +Local LLC false sharing rate = MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM/MEM_UOPS_RETIRED_LOADS_ALL +- +False-sharing of cache lines can dramatically reduce the performance of an +application. This performance group measures the L3 traffic induced by false-sharing. +The false-sharing rate uses all memory load UOPs as reference.
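A sketch of the FALSE_SHARE arithmetic above, with hypothetical counts: each MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM event corresponds to a 64-byte line found modified in another core's cache, and the rate normalizes those events to all retired load uops.

#include <stdio.h>

int main(void) {
    /* Hypothetical counts: HITM snoop loads and all retired load uops. */
    double hitm = 5.0e6, loads = 1.0e9;

    double mbytes = 1.0E-06 * hitm * 64.0; /* traffic from HITM lines  */
    double rate   = hitm / loads;          /* HITM loads per load uop  */

    printf("LLC false sharing: %.1f MByte, rate %.2e\n", mbytes, rate);
    return 0;
}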
diff --git a/collectors/likwid/groups/broadwell/FLOPS_AVX.txt b/collectors/likwid/groups/broadwell/FLOPS_AVX.txt new file mode 100644 index 0000000..7854608 --- /dev/null +++ b/collectors/likwid/groups/broadwell/FLOPS_AVX.txt @@ -0,0 +1,24 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +- +FLOP rates of 256 bit packed floating-point instructions + diff --git a/collectors/likwid/groups/broadwell/FLOPS_DP.txt b/collectors/likwid/groups/broadwell/FLOPS_DP.txt new file mode 100644 index 0000000..348ec76 --- /dev/null +++ b/collectors/likwid/groups/broadwell/FLOPS_DP.txt @@ -0,0 +1,31 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE) +- +AVX/SSE scalar and packed double precision FLOP rates. 
+ diff --git a/collectors/likwid/groups/broadwell/FLOPS_SP.txt b/collectors/likwid/groups/broadwell/FLOPS_SP.txt new file mode 100644 index 0000000..1d7fd7c --- /dev/null +++ b/collectors/likwid/groups/broadwell/FLOPS_SP.txt @@ -0,0 +1,31 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE) +- +AVX/SSE scalar and packed single precision FLOP rates. + diff --git a/collectors/likwid/groups/broadwell/ICACHE.txt b/collectors/likwid/groups/broadwell/ICACHE.txt new file mode 100644 index 0000000..5f11ad6 --- /dev/null +++ b/collectors/likwid/groups/broadwell/ICACHE.txt @@ -0,0 +1,25 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ICACHE_ACCESSES +PMC1 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 + +LONG +Formulas: +L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY +L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES +- +This group measures some L1 instruction cache metrics. 
diff --git a/collectors/likwid/groups/broadwell/L2.txt b/collectors/likwid/groups/broadwell/L2.txt new file mode 100644 index 0000000..60c7f79 --- /dev/null +++ b/collectors/likwid/groups/broadwell/L2.txt @@ -0,0 +1,37 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L2_TRANS_L1D_WB +PMC2 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the +number of cache lines loaded from the L2 to the L1 data cache and the writebacks from +the L1 data cache to the L2 cache. The group also outputs the total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write-allocate +load on a store miss in L1 and cache lines transferred into the instruction +cache. diff --git a/collectors/likwid/groups/broadwell/L2CACHE.txt b/collectors/likwid/groups/broadwell/L2CACHE.txt new file mode 100644 index 0000000..9b5dd4b --- /dev/null +++ b/collectors/likwid/groups/broadwell/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_TRANS_ALL_REQUESTS +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is, +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally, L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+ + diff --git a/collectors/likwid/groups/broadwell/L3.txt b/collectors/likwid/groups/broadwell/L3.txt new file mode 100644 index 0000000..98d1d9e --- /dev/null +++ b/collectors/likwid/groups/broadwell/L3.txt @@ -0,0 +1,36 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ALL +PMC1 L2_TRANS_L2_WB + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed from the +number of cache lines allocated in the L2 and the number of modified cache lines +evicted from the L2. This group also outputs the data volume transferred between the +L3 and the measured cores' L2 caches. Note that this bandwidth also includes data +transfers due to a write-allocate load on a store miss in L2. + diff --git a/collectors/likwid/groups/broadwell/L3CACHE.txt b/collectors/likwid/groups/broadwell/L3CACHE.txt new file mode 100644 index 0000000..f863daa --- /dev/null +++ b/collectors/likwid/groups/broadwell/L3CACHE.txt @@ -0,0 +1,35 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_RETIRED_L3_ALL +PMC1 MEM_LOAD_UOPS_RETIRED_L3_MISS +PMC2 UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate PMC0/PMC2 +L3 miss rate PMC1/PMC2 +L3 miss ratio PMC1/PMC0 + +LONG +Formulas: +L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL +L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL +L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL +- +This group measures the locality of your data accesses with regard to the +L3 cache. L3 request rate tells you how data intensive your code is, +or how many data accesses you have on average per instruction. +The L3 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally, L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse.
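The miss *rate* vs. miss *ratio* distinction used by the cache groups above is easy to mix up; a worked C example with hypothetical counts for the L3CACHE group: rates are normalized to retired uops, the ratio to L3 accesses.

#include <stdio.h>

int main(void) {
    /* Hypothetical counts for the L3CACHE group above. */
    double uops = 8.0e9; /* UOPS_RETIRED_ALL               */
    double req  = 2.0e8; /* MEM_LOAD_UOPS_RETIRED_L3_ALL   */
    double miss = 4.0e7; /* MEM_LOAD_UOPS_RETIRED_L3_MISS  */

    printf("L3 request rate %.4f\n", req / uops);  /* accesses per uop  */
    printf("L3 miss rate    %.4f\n", miss / uops); /* misses per uop    */
    printf("L3 miss ratio   %.4f\n", miss / req);  /* misses per access */
    return 0;
}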
+ + diff --git a/collectors/likwid/groups/broadwell/PORT_USAGE.txt b/collectors/likwid/groups/broadwell/PORT_USAGE.txt new file mode 100644 index 0000000..298df1d --- /dev/null +++ b/collectors/likwid/groups/broadwell/PORT_USAGE.txt @@ -0,0 +1,50 @@ +SHORT Execution port utilization + +REQUIRE_NOHT + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_PORT_PORT_0 +PMC1 UOPS_EXECUTED_PORT_PORT_1 +PMC2 UOPS_EXECUTED_PORT_PORT_2 +PMC3 UOPS_EXECUTED_PORT_PORT_3 +PMC4 UOPS_EXECUTED_PORT_PORT_4 +PMC5 UOPS_EXECUTED_PORT_PORT_5 +PMC6 UOPS_EXECUTED_PORT_PORT_6 +PMC7 UOPS_EXECUTED_PORT_PORT_7 + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Port0 usage ratio PMC0/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port1 usage ratio PMC1/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port2 usage ratio PMC2/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port3 usage ratio PMC3/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port4 usage ratio PMC4/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port5 usage ratio PMC5/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port6 usage ratio PMC6/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port7 usage ratio PMC7/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) + +LONG +Formulas: +Port0 usage ratio = UOPS_EXECUTED_PORT_PORT_0/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port1 usage ratio = UOPS_EXECUTED_PORT_PORT_1/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port2 usage ratio = UOPS_EXECUTED_PORT_PORT_2/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port3 usage ratio = UOPS_EXECUTED_PORT_PORT_3/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port4 usage ratio = UOPS_EXECUTED_PORT_PORT_4/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port5 usage ratio = UOPS_EXECUTED_PORT_PORT_5/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port6 usage ratio = UOPS_EXECUTED_PORT_PORT_6/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port7 usage ratio = UOPS_EXECUTED_PORT_PORT_7/SUM(UOPS_EXECUTED_PORT_PORT_*) +- +This group measures the execution port utilization in a CPU core. The group can +only be measured when HyperThreading is disabled because only then each CPU core +can program eight counters. +Please be aware that the counters PMC4-7 are broken on Intel Broadwell. They +don't increment if either user- or kernel-level filtering is applied. User-level +filtering is the default in LIKWID, hence kernel-level filtering is added +automatically for PMC4-7. The returned counts can therefore be much higher. diff --git a/collectors/likwid/groups/broadwell/RECOVERY.txt b/collectors/likwid/groups/broadwell/RECOVERY.txt new file mode 100644 index 0000000..7928ee4 --- /dev/null +++ b/collectors/likwid/groups/broadwell/RECOVERY.txt @@ -0,0 +1,22 @@ +SHORT Recovery duration + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INT_MISC_RECOVERY_CYCLES +PMC1 INT_MISC_RECOVERY_COUNT + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Avg. recovery duration PMC0/PMC1 + +LONG +Formulas: +Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT +- +This group measures the duration of recoveries after SSE exceptions, memory +disambiguation, etc.
diff --git a/collectors/likwid/groups/broadwell/TLB_DATA.txt b/collectors/likwid/groups/broadwell/TLB_DATA.txt new file mode 100644 index 0000000..8d94e05 --- /dev/null +++ b/collectors/likwid/groups/broadwell/TLB_DATA.txt @@ -0,0 +1,35 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK +PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK +PMC2 DTLB_LOAD_MISSES_WALK_DURATION +PMC3 DTLB_STORE_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB load misses PMC0 +L1 DTLB load miss rate PMC0/FIXC0 +L1 DTLB load miss duration [Cyc] PMC2/PMC0 +L1 DTLB store misses PMC1 +L1 DTLB store miss rate PMC1/FIXC0 +L1 DTLB store miss duration [Cyc] PMC3/PMC1 + +LONG +Formulas: +L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK +L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK +- +The DTLB load and store miss rates give a measure of how often a TLB miss occurred +per instruction. The duration measures how long, in cycles, a page table walk took. + diff --git a/collectors/likwid/groups/broadwell/TLB_INSTR.txt b/collectors/likwid/groups/broadwell/TLB_INSTR.txt new file mode 100644 index 0000000..235d977 --- /dev/null +++ b/collectors/likwid/groups/broadwell/TLB_INSTR.txt @@ -0,0 +1,28 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ITLB_MISSES_CAUSES_A_WALK +PMC1 ITLB_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + + +LONG +Formulas: +L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK +L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK +- +The ITLB miss rate gives a measure of how often a TLB miss occurred +per instruction. The duration measures how long, in cycles, a page table walk took.
+ diff --git a/collectors/likwid/groups/broadwell/TMA.txt b/collectors/likwid/groups/broadwell/TMA.txt new file mode 100644 index 0000000..afb4126 --- /dev/null +++ b/collectors/likwid/groups/broadwell/TMA.txt @@ -0,0 +1,48 @@ +SHORT Top down cycle allocation + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_RETIRED_RETIRE_SLOTS +PMC2 IDQ_UOPS_NOT_DELIVERED_CORE +PMC3 INT_MISC_RECOVERY_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +IPC FIXC0/FIXC1 +Total Slots 4*FIXC1 +Slots Retired PMC1 +Fetch Bubbles PMC2 +Recovery Bubbles 4*PMC3 +Front End [%] PMC2/(4*FIXC1)*100 +Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100 +Retiring [%] PMC1/(4*FIXC1)*100 +Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100 + +LONG +Formulas: +Total Slots = 4*CPU_CLK_UNHALTED_CORE +Slots Retired = UOPS_RETIRED_RETIRE_SLOTS +Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE +Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES +Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100 +Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100 +Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100 +Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100 +-- +This performance group measures cycles to determine the percentage of time spent in +the front end, back end, retiring and speculation. These metrics are published and +verified by Intel. Further information: +Webpage describing the Top-Down Method and its usage in Intel vTune: +https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method +Paper by Ahmad Yasin: +https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0 +Slides by Ahmad Yasin: +http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf +The performance group was originally published here: +http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/ diff --git a/collectors/likwid/groups/broadwell/UOPS.txt b/collectors/likwid/groups/broadwell/UOPS.txt new file mode 100644 index 0000000..e6cc208 --- /dev/null +++ b/collectors/likwid/groups/broadwell/UOPS.txt @@ -0,0 +1,35 @@ +SHORT UOPs execution info + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_EXECUTED_THREAD +PMC2 UOPS_RETIRED_ALL +PMC3 UOPS_ISSUED_FLAGS_MERGE + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Issued UOPs PMC0 +Merged UOPs PMC3 +Executed UOPs PMC1 +Retired UOPs PMC2 + +LONG +Formulas: +Issued UOPs = UOPS_ISSUED_ANY +Merged UOPs = UOPS_ISSUED_FLAGS_MERGE +Executed UOPs = UOPS_EXECUTED_THREAD +Retired UOPs = UOPS_RETIRED_ALL +- +This group returns information about the instruction pipeline. It measures the +issued, executed and retired uOPs and returns the number of uOPs which were issued +but not executed as well as the number of uOPs which were executed but never retired. +The executed but not retired uOPs commonly come from speculatively executed branches.
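The TMA group above distributes the four issue slots available per core cycle over four top-level categories. A small self-contained C sketch of that allocation (the sample counts are invented):

    #include <stdio.h>

    int main(void) {
        double cycles  = 1.0e9;  /* FIXC1: CPU_CLK_UNHALTED_CORE */
        double issued  = 3.0e9;  /* PMC0:  UOPS_ISSUED_ANY */
        double retired = 2.8e9;  /* PMC1:  UOPS_RETIRED_RETIRE_SLOTS */
        double bubbles = 4.0e8;  /* PMC2:  IDQ_UOPS_NOT_DELIVERED_CORE */
        double recov   = 1.0e7;  /* PMC3:  INT_MISC_RECOVERY_CYCLES */

        double slots = 4.0 * cycles;                 /* Total Slots */
        double front = bubbles / slots * 100.0;
        double spec  = (issued - retired + 4.0 * recov) / slots * 100.0;
        double ret   = retired / slots * 100.0;
        /* Back end is defined as the remainder, so the four parts sum to 100%. */
        double back  = (1.0 - (bubbles + issued + 4.0 * recov) / slots) * 100.0;

        printf("Front End %.1f%% Speculation %.1f%% Retiring %.1f%% Back End %.1f%%\n",
               front, spec, ret, back);
        return 0;
    }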
+ diff --git a/collectors/likwid/groups/broadwellD/BRANCH.txt b/collectors/likwid/groups/broadwellD/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often, on average, a branch or a mispredicted branch occurred +per retired instruction. The branch misprediction ratio states what fraction of +all branch instructions were mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/broadwellD/CACHES.txt b/collectors/likwid/groups/broadwellD/CACHES.txt new file mode 100644 index 0000000..275e30f --- /dev/null +++ b/collectors/likwid/groups/broadwellD/CACHES.txt @@ -0,0 +1,123 @@ +SHORT Cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L2_TRANS_L1D_WB +PMC2 L2_LINES_IN_ALL +PMC3 L2_TRANS_L2_WB +CBOX0C1 LLC_VICTIMS_M +CBOX1C1 LLC_VICTIMS_M +CBOX2C1 LLC_VICTIMS_M +CBOX3C1 LLC_VICTIMS_M +CBOX4C1 LLC_VICTIMS_M +CBOX5C1 LLC_VICTIMS_M +CBOX6C1 LLC_VICTIMS_M +CBOX7C1 LLC_VICTIMS_M +CBOX8C1 LLC_VICTIMS_M +CBOX9C1 LLC_VICTIMS_M +CBOX10C1 LLC_VICTIMS_M +CBOX11C1 LLC_VICTIMS_M +CBOX12C1 LLC_VICTIMS_M +CBOX13C1 LLC_VICTIMS_M +CBOX14C1 LLC_VICTIMS_M +CBOX15C1 LLC_VICTIMS_M +CBOX0C0 LLC_LOOKUP_DATA_READ +CBOX1C0 LLC_LOOKUP_DATA_READ +CBOX2C0 LLC_LOOKUP_DATA_READ +CBOX3C0 LLC_LOOKUP_DATA_READ +CBOX4C0 LLC_LOOKUP_DATA_READ +CBOX5C0 LLC_LOOKUP_DATA_READ +CBOX6C0 LLC_LOOKUP_DATA_READ +CBOX7C0 LLC_LOOKUP_DATA_READ +CBOX8C0 LLC_LOOKUP_DATA_READ +CBOX9C0 LLC_LOOKUP_DATA_READ +CBOX10C0 LLC_LOOKUP_DATA_READ +CBOX11C0 LLC_LOOKUP_DATA_READ +CBOX12C0 LLC_LOOKUP_DATA_READ +CBOX13C0 LLC_LOOKUP_DATA_READ +CBOX14C0 LLC_LOOKUP_DATA_READ +CBOX15C0 LLC_LOOKUP_DATA_READ +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 +L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time +L3 to 
L2 load data volume [GBytes] 1.0E-09*PMC2*64.0 +L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time +L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 +System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0)*64.0/time +System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0)*64.0 +L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1)*64/time +L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1)*64 +L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1)*64.0/time +L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1)*64.0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 + +LONG +Formulas: +L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time +L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64 +L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64/time +L1 to L2 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64 +L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB)*64/time +L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB)*64 +L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time +L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64 +L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time +L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64 +L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ))*64/time +System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ))*64 
+L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M))*64/time +L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M))*64 +L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64/time +L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64 +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 +- +Group to measure cache transfers between L1 and memory. Please note that the +L3 to/from system metrics contain any traffic to the system (memory, +Intel QPI, etc.) but do not seem to capture all of it, because the memory read +bandwidth and the L3 to L2 bandwidth are commonly higher than the system to L3 bandwidth. + diff --git a/collectors/likwid/groups/broadwellD/CLOCK.txt b/collectors/likwid/groups/broadwellD/CLOCK.txt new file mode 100644 index 0000000..b81bee6 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/CLOCK.txt @@ -0,0 +1,26 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +UBOXFIX UNCORE_CLOCK + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*UBOXFIX/time +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time +- +Broadwell implements the new RAPL interface. This interface makes it possible to +monitor the energy consumed at the package (socket) level. + diff --git a/collectors/likwid/groups/broadwellD/CYCLE_ACTIVITY.txt b/collectors/likwid/groups/broadwellD/CYCLE_ACTIVITY.txt new file mode 100644 index 0000000..c432a44 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/CYCLE_ACTIVITY.txt @@ -0,0 +1,38 @@ +SHORT Cycle Activities + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING +PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING +PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING +PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Cycles without execution [%] (PMC3/FIXC1)*100 +Cycles without execution due to L1D [%] (PMC2/FIXC1)*100 +Cycles without execution due to L2 [%] (PMC0/FIXC1)*100 +Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100 +-- +This performance group measures the cycles while waiting for data from the cache +and memory hierarchy. +CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts the number of cycles in which nothing is executed on +any execution port. 
+CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while memory subsystem has an +outstanding load. diff --git a/collectors/likwid/groups/broadwellD/CYCLE_STALLS.txt b/collectors/likwid/groups/broadwellD/CYCLE_STALLS.txt new file mode 100644 index 0000000..795aeb9 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/CYCLE_STALLS.txt @@ -0,0 +1,45 @@ +SHORT Cycle Activities (Stalls) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING +PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING +PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING +PMC3 CYCLE_ACTIVITY_STALLS_TOTAL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Total execution stalls PMC3 +Stalls caused by L1D misses [%] (PMC2/PMC3)*100 +Stalls caused by L2 misses [%] (PMC0/PMC3)*100 +Stalls caused by memory loads [%] (PMC1/PMC3)*100 +Execution stall rate [%] (PMC3/FIXC1)*100 +Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100 +Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100 +Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL +Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100 +-- +This performance group measures the stalls caused by data traffic in the cache +hierarchy. +CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls. +CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while memory subsystem has +an outstanding load. diff --git a/collectors/likwid/groups/broadwellD/DATA.txt b/collectors/likwid/groups/broadwellD/DATA.txt new file mode 100644 index 0000000..6955eb7 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_UOPS_RETIRED_LOADS_ALL +PMC1 MEM_UOPS_RETIRED_STORES_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_UOPS_RETIRED_LOADS_ALL/MEM_UOPS_RETIRED_STORES_ALL +- +This is a metric to determine your load to store ratio. 
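The CYCLE_STALLS group above reports each stall reason twice: relative to all stall cycles and relative to all core cycles. A minimal C sketch of the difference between the two views (sample values are invented):

    #include <stdio.h>

    int main(void) {
        double cycles    = 1.0e9;  /* FIXC1: CPU_CLK_UNHALTED_CORE */
        double stalls    = 3.0e8;  /* PMC3:  CYCLE_ACTIVITY_STALLS_TOTAL */
        double l2_stalls = 1.2e8;  /* PMC0:  CYCLE_ACTIVITY_STALLS_L2_PENDING */

        /* Share of the stalled cycles that had an L2 miss outstanding... */
        printf("Stalls caused by L2 misses [%%]: %.1f\n", 100.0 * l2_stalls / stalls);
        /* ...versus the same stalls as a share of all core cycles. */
        printf("Stalls caused by L2 misses rate [%%]: %.1f\n", 100.0 * l2_stalls / cycles);
        return 0;
    }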
+ diff --git a/collectors/likwid/groups/broadwellD/DIVIDE.txt b/collectors/likwid/groups/broadwellD/DIVIDE.txt new file mode 100644 index 0000000..c7c5fb2 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0:EDGEDETECT ARITH_FPU_DIV_ACTIVE +PMC1 ARITH_FPU_DIV_ACTIVE + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0:EDGEDETECT +Avg. divide unit usage duration PMC1/PMC0:EDGEDETECT + +LONG +Formulas: +Number of divide ops = ARITH_FPU_DIV_ACTIVE:EDGEDETECT +Avg. divide unit usage duration = ARITH_FPU_DIV_ACTIVE/ARITH_FPU_DIV_ACTIVE:EDGEDETECT +- +This performance group measures the number and the average latency of divide operations. diff --git a/collectors/likwid/groups/broadwellD/ENERGY.txt b/collectors/likwid/groups/broadwellD/ENERGY.txt new file mode 100644 index 0000000..09eaeb1 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/ENERGY.txt @@ -0,0 +1,39 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR2 PWR_PP1_ENERGY +PWR3 PWR_DRAM_ENERGY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy PP1 [J] PWR2 +Power PP1 [W] PWR2/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power PP1 = PWR_PP1_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +Broadwell implements the new RAPL interface. This interface makes it possible to +monitor the energy consumed at the package (socket) and DRAM level. + diff --git a/collectors/likwid/groups/broadwellD/FALSE_SHARE.txt b/collectors/likwid/groups/broadwellD/FALSE_SHARE.txt new file mode 100644 index 0000000..68107bf --- /dev/null +++ b/collectors/likwid/groups/broadwellD/FALSE_SHARE.txt @@ -0,0 +1,25 @@ +SHORT False sharing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM +PMC2 MEM_UOPS_RETIRED_LOADS_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local LLC false sharing [MByte] 1.E-06*PMC0*64 +Local LLC false sharing rate PMC0/PMC2 + +LONG +Formulas: +Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM*64 +Local LLC false sharing rate = MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM/MEM_UOPS_RETIRED_LOADS_ALL +- +False-sharing of cache lines can dramatically reduce the performance of an +application. This performance group measures the L3 traffic induced by false-sharing. +The false-sharing rate uses all memory loads as reference.
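For illustration, the access pattern the FALSE_SHARE group is meant to catch looks like the following C fragment (not part of this patch). Two threads update different variables that typically share one 64-byte cache line, so the line bounces between cores in modified state and shows up as MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM events; padding each counter onto its own line (e.g. a char pad[56] between them) removes the effect:

    /* build: cc -pthread false_share_demo.c */
    #include <pthread.h>
    #include <stdio.h>

    static struct { long a; long b; } counters;  /* adjacent: usually one cache line */

    static void *bump_a(void *arg) {
        (void)arg;
        for (long i = 0; i < 100000000L; i++) counters.a++;
        return NULL;
    }

    static void *bump_b(void *arg) {
        (void)arg;
        for (long i = 0; i < 100000000L; i++) counters.b++;
        return NULL;
    }

    int main(void) {
        pthread_t t1, t2;
        pthread_create(&t1, NULL, bump_a, NULL);
        pthread_create(&t2, NULL, bump_b, NULL);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        printf("%ld %ld\n", counters.a, counters.b);
        return 0;
    }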
diff --git a/collectors/likwid/groups/broadwellD/FLOPS_AVX.txt b/collectors/likwid/groups/broadwellD/FLOPS_AVX.txt new file mode 100644 index 0000000..7854608 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/FLOPS_AVX.txt @@ -0,0 +1,24 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +- +FLOP rates of 256 bit packed floating-point instructions + diff --git a/collectors/likwid/groups/broadwellD/FLOPS_DP.txt b/collectors/likwid/groups/broadwellD/FLOPS_DP.txt new file mode 100644 index 0000000..348ec76 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/FLOPS_DP.txt @@ -0,0 +1,31 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE) +- +AVX/SSE scalar and packed double precision FLOP rates. 
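The FLOP metrics above weight each retired instruction by the FLOPs it carries: 2 per 128-bit packed DP instruction, 4 per 256-bit packed DP instruction, 1 per scalar. A short C sketch of the FLOPS_DP arithmetic with invented counts:

    #include <stdio.h>

    int main(void) {
        double time_s    = 2.0;    /* measurement runtime */
        double packed128 = 1.0e9;  /* PMC0: FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE */
        double scalar    = 5.0e8;  /* PMC1: FP_ARITH_INST_RETIRED_SCALAR_DOUBLE */
        double packed256 = 2.0e9;  /* PMC2: FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE */

        double mflops = 1.0e-6 * (packed128 * 2.0 + scalar + packed256 * 4.0) / time_s;
        double vec    = 100.0 * (packed128 + packed256) / (packed128 + scalar + packed256);

        printf("DP [MFLOP/s]: %.1f\n", mflops);
        printf("Vectorization ratio [%%]: %.1f\n", vec);
        return 0;
    }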
+ diff --git a/collectors/likwid/groups/broadwellD/FLOPS_SP.txt b/collectors/likwid/groups/broadwellD/FLOPS_SP.txt new file mode 100644 index 0000000..1d7fd7c --- /dev/null +++ b/collectors/likwid/groups/broadwellD/FLOPS_SP.txt @@ -0,0 +1,31 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE) +- +AVX/SSE scalar and packed single precision FLOP rates. + diff --git a/collectors/likwid/groups/broadwellD/HA.txt b/collectors/likwid/groups/broadwellD/HA.txt new file mode 100644 index 0000000..1e5a700 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/HA.txt @@ -0,0 +1,40 @@ +SHORT Main memory bandwidth in MBytes/s seen from Home agent + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +BBOX0C0 IMC_READS_NORMAL +BBOX0C1 BYPASS_IMC_TAKEN +BBOX0C2 IMC_WRITES_ALL +BBOX1C0 IMC_READS_NORMAL +BBOX1C1 BYPASS_IMC_TAKEN +BBOX1C2 IMC_WRITES_ALL + + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(BBOX0C2+BBOX1C2)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(BBOX0C2+BBOX1C2)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1+BBOX0C2+BBOX1C2)*64.0/time +Memory data volume [GBytes] 1.0E-09*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1+BBOX0C2+BBOX1C2)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_READS_NORMAL)+SUM(BYPASS_IMC_TAKEN))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(IMC_READS_NORMAL)+SUM(BYPASS_IMC_TAKEN))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_WRITES_ALL))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(IMC_WRITES_ALL))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_READS_NORMAL) + SUM(BYPASS_IMC_TAKEN) + SUM(IMC_WRITES_ALL))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(IMC_READS_NORMAL) + SUM(BYPASS_IMC_TAKEN) + SUM(IMC_WRITES_ALL))*64.0 +- +This group derives the same metrics as the MEM group but use the events of the +Home Agent, a central unit that is responsible for the protocol side of memory 
+interactions. diff --git a/collectors/likwid/groups/broadwellD/ICACHE.txt b/collectors/likwid/groups/broadwellD/ICACHE.txt new file mode 100644 index 0000000..5f11ad6 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/ICACHE.txt @@ -0,0 +1,25 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ICACHE_ACCESSES +PMC1 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 + +LONG +Formulas: +L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY +L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES +- +This group measures some L1 instruction cache metrics. diff --git a/collectors/likwid/groups/broadwellD/L2.txt b/collectors/likwid/groups/broadwellD/L2.txt new file mode 100644 index 0000000..60c7f79 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/L2.txt @@ -0,0 +1,37 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L2_TRANS_L1D_WB +PMC2 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the +number of cache lines loaded from the L2 into the L1 data cache and the write-backs from +the L1 data cache to the L2 cache. The group also outputs the total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and cache lines transferred into the instruction +cache. 
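All bandwidth metrics in these group files follow one pattern: a cache-line count times 64 bytes, divided by the runtime (scaled by 1.0E-06 for MBytes/s, 1.0E-09 for GBytes of volume). A small C helper showing the pattern for the L2 group above (sample counts are invented):

    #include <stdio.h>

    /* cache lines -> MBytes/s, as in the group formulas */
    static double mbytes_per_s(double cachelines, double time_s) {
        return 1.0e-6 * cachelines * 64.0 / time_s;
    }

    int main(void) {
        double time_s   = 1.5;    /* measurement runtime */
        double l1d_repl = 4.0e8;  /* PMC0: L1D_REPLACEMENT */
        double l1d_wb   = 1.0e8;  /* PMC1: L2_TRANS_L1D_WB */
        double ic_miss  = 2.0e6;  /* PMC2: ICACHE_MISSES */

        printf("L2D load bandwidth [MBytes/s]: %.1f\n", mbytes_per_s(l1d_repl, time_s));
        printf("L2 bandwidth [MBytes/s]: %.1f\n",
               mbytes_per_s(l1d_repl + l1d_wb + ic_miss, time_s));
        return 0;
    }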
diff --git a/collectors/likwid/groups/broadwellD/L2CACHE.txt b/collectors/likwid/groups/broadwellD/L2CACHE.txt new file mode 100644 index 0000000..9b5dd4b --- /dev/null +++ b/collectors/likwid/groups/broadwellD/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_TRANS_ALL_REQUESTS +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS +- +This group measures the locality of your data accesses with regard to the +L2 cache. The L2 request rate tells you how data-intensive your code is, +i.e. how many data accesses you have on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cache lines from memory. Finally, the L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/broadwellD/L3.txt b/collectors/likwid/groups/broadwellD/L3.txt new file mode 100644 index 0000000..98d1d9e --- /dev/null +++ b/collectors/likwid/groups/broadwellD/L3.txt @@ -0,0 +1,36 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ALL +PMC1 L2_TRANS_L2_WB + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed from the +number of cache lines allocated in the L2 and the number of modified cache lines +evicted from the L2. This group also outputs the data volume transferred between the +L3 and the measured cores' L2 caches. Note that this bandwidth also includes data +transfers due to a write allocate load on a store miss in L2. 
+ diff --git a/collectors/likwid/groups/broadwellD/L3CACHE.txt b/collectors/likwid/groups/broadwellD/L3CACHE.txt new file mode 100644 index 0000000..f863daa --- /dev/null +++ b/collectors/likwid/groups/broadwellD/L3CACHE.txt @@ -0,0 +1,35 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_RETIRED_L3_ALL +PMC1 MEM_LOAD_UOPS_RETIRED_L3_MISS +PMC2 UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate PMC0/PMC2 +L3 miss rate PMC1/PMC2 +L3 miss ratio PMC1/PMC0 + +LONG +Formulas: +L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL +L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL +L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL +- +This group measures the locality of your data accesses with regard to the +L3 cache. The L3 request rate tells you how data-intensive your code is, +i.e. how many data accesses you have on average per instruction. +The L3 miss rate gives a measure of how often it was necessary to get +cache lines from memory. Finally, the L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/broadwellD/MEM.txt b/collectors/likwid/groups/broadwellD/MEM.txt new file mode 100644 index 0000000..2a17a2c --- /dev/null +++ b/collectors/likwid/groups/broadwellD/MEM.txt @@ -0,0 +1,52 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime 
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on a +per socket base. Some of the counters may not be available on your system. +Also outputs total data volume transferred from main memory. +The same metrics are provided by the HA group. + diff --git a/collectors/likwid/groups/broadwellD/MEM_DP.txt b/collectors/likwid/groups/broadwellD/MEM_DP.txt new file mode 100644 index 0000000..71ce2ae --- /dev/null +++ b/collectors/likwid/groups/broadwellD/MEM_DP.txt @@ -0,0 +1,73 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +MFLOP/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time +AVX [MFLOP/s] 1.0E-06*(PMC2*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Operational intensity (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +AVX [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 
1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) +-- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per socket base. Also outputs total data volume transferred from main memory. +SSE scalar and packed double precision FLOP rates. Also reports on packed AVX +32b instructions. +The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. diff --git a/collectors/likwid/groups/broadwellD/MEM_SP.txt b/collectors/likwid/groups/broadwellD/MEM_SP.txt new file mode 100644 index 0000000..6d67ea7 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/MEM_SP.txt @@ -0,0 +1,73 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +MFLOP/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time +AVX [MFLOP/s] 1.0E-06*(PMC2*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Operational intensity (PMC0*4.0+PMC1+PMC2*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +AVX [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +Packed [MUOPS/s] = 
1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) +-- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per socket base. Also outputs total data volume transferred from main memory. +SSE scalar and packed single precision FLOP rates. Also reports on packed AVX +32b instructions. +The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. diff --git a/collectors/likwid/groups/broadwellD/PORT_USAGE.txt b/collectors/likwid/groups/broadwellD/PORT_USAGE.txt new file mode 100644 index 0000000..298df1d --- /dev/null +++ b/collectors/likwid/groups/broadwellD/PORT_USAGE.txt @@ -0,0 +1,50 @@ +SHORT Execution port utilization + +REQUIRE_NOHT + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_PORT_PORT_0 +PMC1 UOPS_EXECUTED_PORT_PORT_1 +PMC2 UOPS_EXECUTED_PORT_PORT_2 +PMC3 UOPS_EXECUTED_PORT_PORT_3 +PMC4 UOPS_EXECUTED_PORT_PORT_4 +PMC5 UOPS_EXECUTED_PORT_PORT_5 +PMC6 UOPS_EXECUTED_PORT_PORT_6 +PMC7 UOPS_EXECUTED_PORT_PORT_7 + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Port0 usage ratio PMC0/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port1 usage ratio PMC1/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port2 usage ratio PMC2/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port3 usage ratio PMC3/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port4 usage ratio PMC4/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port5 usage ratio PMC5/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port6 usage ratio PMC6/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port7 usage ratio PMC7/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) + +LONG +Formulas: +Port0 usage ratio = UOPS_EXECUTED_PORT_PORT_0/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port1 usage ratio = UOPS_EXECUTED_PORT_PORT_1/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port2 usage ratio = UOPS_EXECUTED_PORT_PORT_2/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port3 usage ratio = UOPS_EXECUTED_PORT_PORT_3/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port4 usage ratio = UOPS_EXECUTED_PORT_PORT_4/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port5 usage ratio = UOPS_EXECUTED_PORT_PORT_5/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port6 usage ratio = UOPS_EXECUTED_PORT_PORT_6/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port7 usage ratio = UOPS_EXECUTED_PORT_PORT_7/SUM(UOPS_EXECUTED_PORT_PORT_*) +- +This group measures the execution port utilization in a CPU core. The group can +only be measured when HyperThreading is disabled because only then each CPU core +can program eight counters. 
+Please be aware that the counters PMC4-7 are broken on Intel Broadwell. They +don't increment if either user- or kernel-level filtering is applied. User-level +filtering is the default in LIKWID, hence kernel-level filtering is added +automatically for PMC4-7. Because these counters then count both user- and kernel-level +events, the returned counts can be much higher than for the other counters. diff --git a/collectors/likwid/groups/broadwellD/RECOVERY.txt b/collectors/likwid/groups/broadwellD/RECOVERY.txt new file mode 100644 index 0000000..7928ee4 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/RECOVERY.txt @@ -0,0 +1,22 @@ +SHORT Recovery duration + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INT_MISC_RECOVERY_CYCLES +PMC1 INT_MISC_RECOVERY_COUNT + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Avg. recovery duration PMC0/PMC1 + +LONG +Formulas: +Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT +- +This group measures the duration of recoveries after SSE exceptions, memory +disambiguation, etc. diff --git a/collectors/likwid/groups/broadwellD/TLB_DATA.txt b/collectors/likwid/groups/broadwellD/TLB_DATA.txt new file mode 100644 index 0000000..8d94e05 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/TLB_DATA.txt @@ -0,0 +1,35 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK +PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK +PMC2 DTLB_LOAD_MISSES_WALK_DURATION +PMC3 DTLB_STORE_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB load misses PMC0 +L1 DTLB load miss rate PMC0/FIXC0 +L1 DTLB load miss duration [Cyc] PMC2/PMC0 +L1 DTLB store misses PMC1 +L1 DTLB store miss rate PMC1/FIXC0 +L1 DTLB store miss duration [Cyc] PMC3/PMC1 + +LONG +Formulas: +L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK +L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK +- +The DTLB load and store miss rates give a measure of how often a TLB miss occurred +per instruction. The duration states how many cycles a page table walk took on average.
+ diff --git a/collectors/likwid/groups/broadwellD/TLB_INSTR.txt b/collectors/likwid/groups/broadwellD/TLB_INSTR.txt new file mode 100644 index 0000000..235d977 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/TLB_INSTR.txt @@ -0,0 +1,28 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ITLB_MISSES_CAUSES_A_WALK +PMC1 ITLB_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + + +LONG +Formulas: +L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK +L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK +- +The ITLB miss rate gives a measure of how often a TLB miss occurred +per instruction. The duration states how many cycles a page table walk took on average. + diff --git a/collectors/likwid/groups/broadwellD/TMA.txt b/collectors/likwid/groups/broadwellD/TMA.txt new file mode 100644 index 0000000..afb4126 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/TMA.txt @@ -0,0 +1,48 @@ +SHORT Top down cycle allocation + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_RETIRED_RETIRE_SLOTS +PMC2 IDQ_UOPS_NOT_DELIVERED_CORE +PMC3 INT_MISC_RECOVERY_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +IPC FIXC0/FIXC1 +Total Slots 4*FIXC1 +Slots Retired PMC1 +Fetch Bubbles PMC2 +Recovery Bubbles 4*PMC3 +Front End [%] PMC2/(4*FIXC1)*100 +Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100 +Retiring [%] PMC1/(4*FIXC1)*100 +Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100 + +LONG +Formulas: +Total Slots = 4*CPU_CLK_UNHALTED_CORE +Slots Retired = UOPS_RETIRED_RETIRE_SLOTS +Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE +Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES +Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100 +Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100 +Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100 +Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100 +-- +This performance group measures cycles to determine the percentage of time spent in +the front end, back end, retiring and speculation. These metrics are published and +verified by Intel. 
Further information: +Webpage describing the Top-Down Method and its usage in Intel vTune: +https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method +Paper by Ahmad Yasin: +https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0 +Slides by Ahmad Yasin: +http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf +The performance group was originally published here: +http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/ diff --git a/collectors/likwid/groups/broadwellD/UOPS.txt b/collectors/likwid/groups/broadwellD/UOPS.txt new file mode 100644 index 0000000..e6cc208 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/UOPS.txt @@ -0,0 +1,35 @@ +SHORT UOPs execution info + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_EXECUTED_THREAD +PMC2 UOPS_RETIRED_ALL +PMC3 UOPS_ISSUED_FLAGS_MERGE + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Issued UOPs PMC0 +Merged UOPs PMC3 +Executed UOPs PMC1 +Retired UOPs PMC2 + +LONG +Formulas: +Issued UOPs = UOPS_ISSUED_ANY +Merged UOPs = UOPS_ISSUED_FLAGS_MERGE +Executed UOPs = UOPS_EXECUTED_THREAD +Retired UOPs = UOPS_RETIRED_ALL +- +This group returns information about the instruction pipeline. It measures the +issued, executed and retired uOPs and returns the number of uOPs which were issued +but not executed as well as the number of uOPs which were executed but never retired. +The executed but not retired uOPs commonly come from speculatively executed branches. + diff --git a/collectors/likwid/groups/broadwellEP/BRANCH.txt b/collectors/likwid/groups/broadwellEP/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often, on average, a branch or a mispredicted branch occurred +per retired instruction. The branch misprediction ratio states what fraction of +all branch instructions were mispredicted. +Instructions per branch is 1/branch rate.
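As a usage note (assuming the standard LIKWID tools, which are not part of this patch): a group file is selected by its base name, so the group above can be measured with, for example,

    likwid-perfctr -C 0-3 -g BRANCH ./a.out

which programs the EVENTSET on cores 0-3 while running ./a.out and then prints the METRICS table, one column per measured core.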
+ diff --git a/collectors/likwid/groups/broadwellEP/CACHES.txt b/collectors/likwid/groups/broadwellEP/CACHES.txt new file mode 100644 index 0000000..6a14e52 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/CACHES.txt @@ -0,0 +1,135 @@ +SHORT Cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L2_TRANS_L1D_WB +PMC2 L2_LINES_IN_ALL +PMC3 L2_TRANS_L2_WB +CBOX0C1 LLC_VICTIMS_M +CBOX1C1 LLC_VICTIMS_M +CBOX2C1 LLC_VICTIMS_M +CBOX3C1 LLC_VICTIMS_M +CBOX4C1 LLC_VICTIMS_M +CBOX5C1 LLC_VICTIMS_M +CBOX6C1 LLC_VICTIMS_M +CBOX7C1 LLC_VICTIMS_M +CBOX8C1 LLC_VICTIMS_M +CBOX9C1 LLC_VICTIMS_M +CBOX10C1 LLC_VICTIMS_M +CBOX11C1 LLC_VICTIMS_M +CBOX12C1 LLC_VICTIMS_M +CBOX13C1 LLC_VICTIMS_M +CBOX14C1 LLC_VICTIMS_M +CBOX15C1 LLC_VICTIMS_M +CBOX16C1 LLC_VICTIMS_M +CBOX17C1 LLC_VICTIMS_M +CBOX18C1 LLC_VICTIMS_M +CBOX19C1 LLC_VICTIMS_M +CBOX20C1 LLC_VICTIMS_M +CBOX21C1 LLC_VICTIMS_M +CBOX0C0 LLC_LOOKUP_DATA_READ +CBOX1C0 LLC_LOOKUP_DATA_READ +CBOX2C0 LLC_LOOKUP_DATA_READ +CBOX3C0 LLC_LOOKUP_DATA_READ +CBOX4C0 LLC_LOOKUP_DATA_READ +CBOX5C0 LLC_LOOKUP_DATA_READ +CBOX6C0 LLC_LOOKUP_DATA_READ +CBOX7C0 LLC_LOOKUP_DATA_READ +CBOX8C0 LLC_LOOKUP_DATA_READ +CBOX9C0 LLC_LOOKUP_DATA_READ +CBOX10C0 LLC_LOOKUP_DATA_READ +CBOX11C0 LLC_LOOKUP_DATA_READ +CBOX12C0 LLC_LOOKUP_DATA_READ +CBOX13C0 LLC_LOOKUP_DATA_READ +CBOX14C0 LLC_LOOKUP_DATA_READ +CBOX15C0 LLC_LOOKUP_DATA_READ +CBOX16C0 LLC_LOOKUP_DATA_READ +CBOX17C0 LLC_LOOKUP_DATA_READ +CBOX18C0 LLC_LOOKUP_DATA_READ +CBOX19C0 LLC_LOOKUP_DATA_READ +CBOX20C0 LLC_LOOKUP_DATA_READ +CBOX21C0 LLC_LOOKUP_DATA_READ +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 +L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time +L3 to L2 load data volume [GBytes] 1.0E-09*PMC2*64.0 +L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time +L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 +System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0)*64.0/time +System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0)*64.0 +L3 to system bandwidth [MBytes/s] 
1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1)*64/time +L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1)*64 +L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1)*64.0/time +L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1)*64.0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 + +LONG +Formulas: +L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time +L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64 +L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time +L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64 +L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time +L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64 +L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time +L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64 +L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time +L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64 +L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ))*64/time +System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ))*64 +L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M))*64/time +L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M))*64 +L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64/time +L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64 +Memory read bandwidth [MBytes/s] = 
1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 +- +Group to measure cache transfers between L1 and memory. Please note that the +L3 to/from system metrics contain any traffic to the system (memory, +Intel QPI, etc.) but do not seem to capture it completely, because the memory read +bandwidth and the L3 to L2 bandwidth are commonly higher than the system to L3 bandwidth. + diff --git a/collectors/likwid/groups/broadwellEP/CLOCK.txt b/collectors/likwid/groups/broadwellEP/CLOCK.txt new file mode 100644 index 0000000..b81bee6 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/CLOCK.txt @@ -0,0 +1,26 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +UBOXFIX UNCORE_CLOCK + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*UBOXFIX/time +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time +- +Broadwell implements the new RAPL interface. This interface enables monitoring +of the consumed energy on the package (socket) level. + diff --git a/collectors/likwid/groups/broadwellEP/CYCLE_ACTIVITY.txt b/collectors/likwid/groups/broadwellEP/CYCLE_ACTIVITY.txt new file mode 100644 index 0000000..c432a44 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/CYCLE_ACTIVITY.txt @@ -0,0 +1,38 @@ +SHORT Cycle Activities + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING +PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING +PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING +PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Cycles without execution [%] (PMC3/FIXC1)*100 +Cycles without execution due to L1D [%] (PMC2/FIXC1)*100 +Cycles without execution due to L2 [%] (PMC0/FIXC1)*100 +Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100 +Cycles with stalls due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles with stalls due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100 +-- +This performance group measures the cycles spent waiting for data from the cache +and memory hierarchy. +CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts number of cycles nothing is executed on +any execution port. +CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while memory subsystem has an +outstanding load.
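Nearly all bandwidth and volume metrics in these groups follow one pattern: an event counts 64-byte cache lines, which is scaled by the RDTSC runtime. A minimal C sketch of that conversion, assuming the 64-byte line size used in the formulas above (helper names are illustrative):

/* Convert a cache-line count into bandwidth and data volume, assuming
 * 64-byte lines and the measurement runtime in seconds. */
static double lines_to_mbytes_per_s(double lines, double time_s)
{
    return 1.0E-06 * lines * 64.0 / time_s;
}

static double lines_to_gbytes(double lines)
{
    return 1.0E-09 * lines * 64.0;
}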
diff --git a/collectors/likwid/groups/broadwellEP/CYCLE_STALLS.txt b/collectors/likwid/groups/broadwellEP/CYCLE_STALLS.txt new file mode 100644 index 0000000..795aeb9 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/CYCLE_STALLS.txt @@ -0,0 +1,45 @@ +SHORT Cycle Activities (Stalls) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING +PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING +PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING +PMC3 CYCLE_ACTIVITY_STALLS_TOTAL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Total execution stalls PMC3 +Stalls caused by L1D misses [%] (PMC2/PMC3)*100 +Stalls caused by L2 misses [%] (PMC0/PMC3)*100 +Stalls caused by memory loads [%] (PMC1/PMC3)*100 +Execution stall rate [%] (PMC3/FIXC1)*100 +Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100 +Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100 +Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL +Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100 +-- +This performance group measures the stalls caused by data traffic in the cache +hierarchy. +CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls. +CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while memory subsystem has +an outstanding load. diff --git a/collectors/likwid/groups/broadwellEP/DATA.txt b/collectors/likwid/groups/broadwellEP/DATA.txt new file mode 100644 index 0000000..6955eb7 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_UOPS_RETIRED_LOADS_ALL +PMC1 MEM_UOPS_RETIRED_STORES_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_UOPS_RETIRED_LOADS_ALL/MEM_UOPS_RETIRED_STORES_ALL +- +This is a metric to determine your load to store ratio. 
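The two kinds of percentages in the CYCLE_STALLS group differ only in their denominator: the "caused by" metrics are normalized to total execution stalls, while the "rate" metrics are normalized to unhalted core cycles. A small C sketch with illustrative names:

/* CYCLE_STALLS normalizations: share of all stalls vs. share of all cycles. */
static void stall_breakdown(double stalls_total, double stalls_l2_pending,
                            double core_cycles)
{
    double caused_by_l2  = (stalls_l2_pending / stalls_total) * 100.0; /* of stalls */
    double l2_stall_rate = (stalls_l2_pending / core_cycles) * 100.0;  /* of cycles */
    double stall_rate    = (stalls_total / core_cycles) * 100.0;
    (void)caused_by_l2; (void)l2_stall_rate; (void)stall_rate;
}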
+ diff --git a/collectors/likwid/groups/broadwellEP/DIVIDE.txt b/collectors/likwid/groups/broadwellEP/DIVIDE.txt new file mode 100644 index 0000000..c7c5fb2 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0:EDGEDETECT ARITH_FPU_DIV_ACTIVE +PMC1 ARITH_FPU_DIV_ACTIVE + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0:EDGEDETECT +Avg. divide unit usage duration PMC1/PMC0:EDGEDETECT + +LONG +Formulas: +Number of divide ops = ARITH_FPU_DIV_ACTIVE:EDGEDETECT +Avg. divide unit usage duration = ARITH_FPU_DIV_ACTIVE/ARITH_FPU_DIV_ACTIVE:EDGEDETECT +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/broadwellEP/ENERGY.txt b/collectors/likwid/groups/broadwellEP/ENERGY.txt new file mode 100644 index 0000000..fe7829f --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/ENERGY.txt @@ -0,0 +1,35 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR3 PWR_DRAM_ENERGY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +Broadwell implements the new RAPL interface. This interface enables monitoring +of the consumed energy on the package (socket) and DRAM level. + diff --git a/collectors/likwid/groups/broadwellEP/FALSE_SHARE.txt b/collectors/likwid/groups/broadwellEP/FALSE_SHARE.txt new file mode 100644 index 0000000..602b606 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/FALSE_SHARE.txt @@ -0,0 +1,30 @@ +SHORT False sharing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM +PMC1 MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM +PMC2 MEM_UOPS_RETIRED_LOADS_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local LLC false sharing [MByte] 1.E-06*PMC0*64 +Local LLC false sharing rate PMC0/PMC2 +Remote LLC false sharing [MByte] 1.E-06*PMC1*64 +Remote LLC false sharing rate PMC1/PMC2 + +LONG +Formulas: +Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM*64 +Local LLC false sharing rate = MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM/MEM_UOPS_RETIRED_LOADS_ALL +Remote LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM*64 +Remote LLC false sharing rate = MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM/MEM_UOPS_RETIRED_LOADS_ALL +- +False-sharing of cache lines can dramatically reduce the performance of an +application. This performance group measures the L3 traffic induced by false-sharing. +The false-sharing rate uses all memory load UOPs as reference.
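Since RAPL exposes plain energy counts in Joules, every power metric above is simply the measured energy divided by the RDTSC runtime. A one-function sketch of that relation:

/* RAPL-style power: energy is reported in Joules over the measurement
 * interval, so average power is energy divided by elapsed time. */
static double average_watts(double energy_joules, double time_s)
{
    return energy_joules / time_s;
}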
diff --git a/collectors/likwid/groups/broadwellEP/FLOPS_AVX.txt b/collectors/likwid/groups/broadwellEP/FLOPS_AVX.txt new file mode 100644 index 0000000..7854608 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/FLOPS_AVX.txt @@ -0,0 +1,24 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +- +FLOP rates of 256 bit packed floating-point instructions + diff --git a/collectors/likwid/groups/broadwellEP/FLOPS_DP.txt b/collectors/likwid/groups/broadwellEP/FLOPS_DP.txt new file mode 100644 index 0000000..348ec76 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/FLOPS_DP.txt @@ -0,0 +1,31 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE) +- +AVX/SSE scalar and packed double precision FLOP rates. 
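The FLOPS_DP formulas weight each retired FP instruction by the FLOPs it performs: 2 for 128-bit packed double, 1 for scalar, 4 for 256-bit packed double. A compact C sketch of the MFLOP/s and vectorization-ratio arithmetic (names are illustrative):

/* FLOPS_DP accounting: FLOPs per retired instruction are 2 (128-bit packed),
 * 1 (scalar) and 4 (256-bit packed) for double precision. */
static double dp_mflops(double pd128, double scalar, double pd256, double time_s)
{
    return 1.0E-06 * (pd128 * 2.0 + scalar + pd256 * 4.0) / time_s;
}

/* Share of packed (vectorized) FP instructions among all FP instructions. */
static double dp_vectorization_ratio(double pd128, double scalar, double pd256)
{
    return 100.0 * (pd128 + pd256) / (pd128 + scalar + pd256);
}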
+ diff --git a/collectors/likwid/groups/broadwellEP/FLOPS_SP.txt b/collectors/likwid/groups/broadwellEP/FLOPS_SP.txt new file mode 100644 index 0000000..1d7fd7c --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/FLOPS_SP.txt @@ -0,0 +1,31 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE) +- +AVX/SSE scalar and packed single precision FLOP rates. + diff --git a/collectors/likwid/groups/broadwellEP/HA.txt b/collectors/likwid/groups/broadwellEP/HA.txt new file mode 100644 index 0000000..1e5a700 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/HA.txt @@ -0,0 +1,40 @@ +SHORT Main memory bandwidth in MBytes/s seen from Home Agent + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +BBOX0C0 IMC_READS_NORMAL +BBOX0C1 BYPASS_IMC_TAKEN +BBOX0C2 IMC_WRITES_ALL +BBOX1C0 IMC_READS_NORMAL +BBOX1C1 BYPASS_IMC_TAKEN +BBOX1C2 IMC_WRITES_ALL + + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(BBOX0C2+BBOX1C2)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(BBOX0C2+BBOX1C2)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1+BBOX0C2+BBOX1C2)*64.0/time +Memory data volume [GBytes] 1.0E-09*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1+BBOX0C2+BBOX1C2)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_READS_NORMAL)+SUM(BYPASS_IMC_TAKEN))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(IMC_READS_NORMAL)+SUM(BYPASS_IMC_TAKEN))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_WRITES_ALL))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(IMC_WRITES_ALL))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_READS_NORMAL) + SUM(BYPASS_IMC_TAKEN) + SUM(IMC_WRITES_ALL))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(IMC_READS_NORMAL) + SUM(BYPASS_IMC_TAKEN) + SUM(IMC_WRITES_ALL))*64.0 +- +This group derives the same metrics as the MEM group but uses the events of the +Home Agent, a central unit that is responsible for the protocol side of
memory +interactions. diff --git a/collectors/likwid/groups/broadwellEP/ICACHE.txt b/collectors/likwid/groups/broadwellEP/ICACHE.txt new file mode 100644 index 0000000..5f11ad6 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/ICACHE.txt @@ -0,0 +1,25 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ICACHE_ACCESSES +PMC1 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 + +LONG +Formulas: +L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY +L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES +- +This group measures some L1 instruction cache metrics. diff --git a/collectors/likwid/groups/broadwellEP/L2.txt b/collectors/likwid/groups/broadwellEP/L2.txt new file mode 100644 index 0000000..60c7f79 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/L2.txt @@ -0,0 +1,37 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L2_TRANS_L1D_WB +PMC2 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the +number of cache lines loaded from the L2 to the L1 data cache and the writebacks from +the L1 data cache to the L2 cache. The group also outputs total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and cache lines transferred into the instruction +cache.
diff --git a/collectors/likwid/groups/broadwellEP/L2CACHE.txt b/collectors/likwid/groups/broadwellEP/L2CACHE.txt new file mode 100644 index 0000000..9b5dd4b --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_TRANS_ALL_REQUESTS +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure how often it was necessary to get +cache lines from memory. And finally L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/broadwellEP/L3.txt b/collectors/likwid/groups/broadwellEP/L3.txt new file mode 100644 index 0000000..98d1d9e --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/L3.txt @@ -0,0 +1,36 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ALL +PMC1 L2_TRANS_L2_WB + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the +number of cache lines allocated in the L2 and the number of modified cache lines +evicted from the L2. This group also outputs the data volume transferred between the +L3 and the measured cores' L2 caches. Note that this bandwidth also includes data +transfers due to a write allocate load on a store miss in L2.
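The rate/ratio distinction used by all the cache groups is easy to misread: a rate is normalized to retired instructions, a ratio to the cache requests themselves. A minimal C sketch with illustrative names:

/* Cache locality metrics: request rate and miss rate are per retired
 * instruction, miss ratio is misses relative to the requests themselves. */
static void cache_locality(double requests, double misses, double instructions)
{
    double request_rate = requests / instructions;
    double miss_rate    = misses / instructions;
    double miss_ratio   = misses / requests;
    (void)request_rate; (void)miss_rate; (void)miss_ratio;
}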
+ diff --git a/collectors/likwid/groups/broadwellEP/L3CACHE.txt b/collectors/likwid/groups/broadwellEP/L3CACHE.txt new file mode 100644 index 0000000..f863daa --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/L3CACHE.txt @@ -0,0 +1,35 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_RETIRED_L3_ALL +PMC1 MEM_LOAD_UOPS_RETIRED_L3_MISS +PMC2 UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate PMC0/PMC2 +L3 miss rate PMC1/PMC2 +L3 miss ratio PMC1/PMC0 + +LONG +Formulas: +L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL +L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL +L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL +- +This group measures the locality of your data accesses with regard to the +L3 cache. L3 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L3 miss rate gives a measure how often it was necessary to get +cache lines from memory. And finally L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/broadwellEP/MEM.txt b/collectors/likwid/groups/broadwellEP/MEM.txt new file mode 100644 index 0000000..2a17a2c --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/MEM.txt @@ -0,0 +1,52 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on a +per-socket basis. Some of the counters may not be available on your system. +Also outputs total data volume transferred from main memory. +The same metrics are provided by the HA group. + diff --git a/collectors/likwid/groups/broadwellEP/MEM_DP.txt b/collectors/likwid/groups/broadwellEP/MEM_DP.txt new file mode 100644 index 0000000..6078d57 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/MEM_DP.txt @@ -0,0 +1,73 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +MFLOP/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time +AVX [MFLOP/s] 1.0E-06*(PMC2*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Operational intensity (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +AVX [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s]
= 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) +-- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per-socket basis. Also outputs total data volume transferred from main memory. +SSE scalar and packed double precision FLOP rates. Also reports on packed AVX +256-bit instructions. +The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. diff --git a/collectors/likwid/groups/broadwellEP/MEM_SP.txt b/collectors/likwid/groups/broadwellEP/MEM_SP.txt new file mode 100644 index 0000000..d18d2ab --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/MEM_SP.txt @@ -0,0 +1,73 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +MFLOP/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time +AVX [MFLOP/s] 1.0E-06*(PMC2*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Operational intensity (PMC0*4.0+PMC1+PMC2*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +AVX [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +Packed [MUOPS/s] =
1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) +-- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per-socket basis. Also outputs total data volume transferred from main memory. +SSE scalar and packed single precision FLOP rates. Also reports on packed AVX +256-bit instructions. +The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. diff --git a/collectors/likwid/groups/broadwellEP/NUMA.txt b/collectors/likwid/groups/broadwellEP/NUMA.txt new file mode 100644 index 0000000..5b30e25 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/NUMA.txt @@ -0,0 +1,41 @@ +SHORT Local and remote data transfers + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +BBOX0C0 REQUESTS_READS_LOCAL +BBOX1C0 REQUESTS_READS_LOCAL +BBOX0C1 REQUESTS_READS_REMOTE +BBOX1C1 REQUESTS_READS_REMOTE +BBOX0C2 REQUESTS_WRITES_LOCAL +BBOX1C2 REQUESTS_WRITES_LOCAL +BBOX0C3 REQUESTS_WRITES_REMOTE +BBOX1C3 REQUESTS_WRITES_REMOTE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local bandwidth [MByte/s] 1.E-06*((BBOX0C0+BBOX1C0+BBOX0C2+BBOX1C2)*64)/time +Local data volume [GByte] 1.E-09*(BBOX0C0+BBOX1C0+BBOX0C2+BBOX1C2)*64 +Remote bandwidth [MByte/s] 1.E-06*((BBOX0C1+BBOX1C1+BBOX0C3+BBOX1C3)*64)/time +Remote data volume [GByte] 1.E-09*(BBOX0C1+BBOX1C1+BBOX0C3+BBOX1C3)*64 +Total bandwidth [MByte/s] 1.E-06*((BBOX0C0+BBOX1C0+BBOX0C2+BBOX1C2+BBOX0C1+BBOX1C1+BBOX0C3+BBOX1C3)*64)/time +Total data volume [GByte] 1.E-09*(BBOX0C0+BBOX1C0+BBOX0C2+BBOX1C2+BBOX0C1+BBOX1C1+BBOX0C3+BBOX1C3)*64 + + +LONG +Formulas: +CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY +Local bandwidth [MByte/s] = 1.E-06*((SUM(REQUESTS_READS_LOCAL)+SUM(REQUESTS_WRITES_LOCAL))*64)/time +Local data volume [GByte] = 1.E-09*(SUM(REQUESTS_READS_LOCAL)+SUM(REQUESTS_WRITES_LOCAL))*64 +Remote bandwidth [MByte/s] = 1.E-06*((SUM(REQUESTS_READS_REMOTE)+SUM(REQUESTS_WRITES_REMOTE))*64)/time +Remote data volume [GByte] = 1.E-09*(SUM(REQUESTS_READS_REMOTE)+SUM(REQUESTS_WRITES_REMOTE))*64 +Total bandwidth [MByte/s] = 1.E-06*((SUM(REQUESTS_READS_LOCAL)+SUM(REQUESTS_WRITES_LOCAL)+SUM(REQUESTS_READS_REMOTE)+SUM(REQUESTS_WRITES_REMOTE))*64)/time +Total data volume [GByte] = 1.E-09*(SUM(REQUESTS_READS_LOCAL)+SUM(REQUESTS_WRITES_LOCAL)+SUM(REQUESTS_READS_REMOTE)+SUM(REQUESTS_WRITES_REMOTE))*64 +-- +This performance group measures the data traffic of CPU sockets to local and remote +CPU sockets. It uses the Home Agent for calculation.
This may also include data from sources other than the memory controllers. diff --git a/collectors/likwid/groups/broadwellEP/PORT_USAGE.txt b/collectors/likwid/groups/broadwellEP/PORT_USAGE.txt new file mode 100644 index 0000000..298df1d --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/PORT_USAGE.txt @@ -0,0 +1,50 @@ +SHORT Execution port utilization + +REQUIRE_NOHT + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_PORT_PORT_0 +PMC1 UOPS_EXECUTED_PORT_PORT_1 +PMC2 UOPS_EXECUTED_PORT_PORT_2 +PMC3 UOPS_EXECUTED_PORT_PORT_3 +PMC4 UOPS_EXECUTED_PORT_PORT_4 +PMC5 UOPS_EXECUTED_PORT_PORT_5 +PMC6 UOPS_EXECUTED_PORT_PORT_6 +PMC7 UOPS_EXECUTED_PORT_PORT_7 + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Port0 usage ratio PMC0/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port1 usage ratio PMC1/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port2 usage ratio PMC2/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port3 usage ratio PMC3/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port4 usage ratio PMC4/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port5 usage ratio PMC5/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port6 usage ratio PMC6/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port7 usage ratio PMC7/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) + +LONG +Formulas: +Port0 usage ratio = UOPS_EXECUTED_PORT_PORT_0/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port1 usage ratio = UOPS_EXECUTED_PORT_PORT_1/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port2 usage ratio = UOPS_EXECUTED_PORT_PORT_2/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port3 usage ratio = UOPS_EXECUTED_PORT_PORT_3/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port4 usage ratio = UOPS_EXECUTED_PORT_PORT_4/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port5 usage ratio = UOPS_EXECUTED_PORT_PORT_5/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port6 usage ratio = UOPS_EXECUTED_PORT_PORT_6/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port7 usage ratio = UOPS_EXECUTED_PORT_PORT_7/SUM(UOPS_EXECUTED_PORT_PORT_*) +- +This group measures the execution port utilization in a CPU core. The group can +only be measured when HyperThreading is disabled because only then can each CPU core +program eight counters. +Please be aware that the counters PMC4-7 are broken on Intel Broadwell. They +don't increment if either user- or kernel-level filtering is applied. User-level +filtering is the default in LIKWID, hence kernel-level filtering is added +automatically for PMC4-7. The returned counts can therefore be much higher.
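Each PORT_USAGE ratio is one port's executed uOPs relative to the uOPs executed on all eight ports combined. A short C sketch of that normalization (the array layout is illustrative):

/* PORT_USAGE: each port's share of all uops executed across the 8 ports. */
static void port_usage_ratios(const double uops_per_port[8], double ratio[8])
{
    double total = 0.0;
    for (int i = 0; i < 8; i++)
        total += uops_per_port[i];
    for (int i = 0; i < 8; i++)
        ratio[i] = uops_per_port[i] / total;
}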
diff --git a/collectors/likwid/groups/broadwellEP/QPI.txt b/collectors/likwid/groups/broadwellEP/QPI.txt new file mode 100644 index 0000000..8594706 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/QPI.txt @@ -0,0 +1,49 @@ +SHORT QPI Link Layer data + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +QBOX0C0 RXL_FLITS_G0_DATA +QBOX1C0 RXL_FLITS_G0_DATA +QBOX0C1 RXL_FLITS_G0_NON_DATA +QBOX1C1 RXL_FLITS_G0_NON_DATA +QBOX0C2 TXL_FLITS_G0_DATA +QBOX1C2 TXL_FLITS_G0_DATA +QBOX0C3 TXL_FLITS_G0_NON_DATA +QBOX1C3 TXL_FLITS_G0_NON_DATA + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +QPI send data volume [GByte] 1.E-09*(QBOX0C2+QBOX1C2)*8 +QPI send data bandwidth [MByte/s] 1.E-06*(QBOX0C2+QBOX1C2)*8/time +QPI send link volume [GByte] 1.E-09*(QBOX0C2+QBOX1C2+QBOX0C3+QBOX1C3)*8 +QPI send link bandwidth [MByte/s] 1.E-06*(QBOX0C2+QBOX1C2+QBOX0C3+QBOX1C3)*8/time +QPI receive data volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0)*8 +QPI receive data bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0)*8/time +QPI receive link volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0+QBOX0C1+QBOX1C1)*8 +QPI receive link bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0+QBOX0C1+QBOX1C1)*8/time +QPI total transfer volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0+QBOX0C2+QBOX1C2+QBOX0C1+QBOX1C1+QBOX0C3+QBOX1C3)*8 +QPI total bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0+QBOX0C2+QBOX1C2+QBOX0C1+QBOX1C1+QBOX0C3+QBOX1C3)*8/time + +LONG +Formulas: +QPI send data volume [GByte] = 1.E-09*(sum(TXL_FLITS_G0_DATA)*8) +QPI send data bandwidth [MByte/s] = 1.E-06*(sum(TXL_FLITS_G0_DATA)*8)/runtime +QPI send link volume [GByte] = 1.E-09*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8) +QPI send link bandwidth [MByte/s] = 1.E-06*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8)/runtime +QPI receive data volume [GByte] = 1.E-09*(sum(RXL_FLITS_G0_DATA)*8) +QPI receive data bandwidth [MByte/s] = 1.E-06*(sum(RXL_FLITS_G0_DATA)*8)/runtime +QPI receive link volume [GByte] = 1.E-09*((sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8) +QPI receive link bandwidth [MByte/s] = 1.E-06*((sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8)/runtime +QPI total transfer volume [GByte] = 1.E-09*(sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA)+sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8 +QPI total bandwidth [MByte/s] = 1.E-06*(sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA)+sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8/time +-- +The Intel QPI Link Layer is responsible for packetizing requests from the caching agent (CBOXes) +on the way out to the system interface. For Broadwell EP systems, the Link Layer and the +Ring interface are separated. The QPI link volume contains header, data and trailer flits, while the +QPI data volume counts only the data flits.
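The factor 8 in the QPI formulas reflects that one G0 flit carries 8 bytes; data volume counts data flits only, while link volume adds the non-data (header/trailer) flits. A minimal C sketch under that assumption:

/* QPI accounting: a G0 flit carries 8 bytes. Data volume uses data flits
 * only; link volume also includes the non-data (header/trailer) flits. */
static double qpi_data_gbytes(double data_flits)
{
    return 1.0E-09 * data_flits * 8.0;
}

static double qpi_link_gbytes(double data_flits, double nondata_flits)
{
    return 1.0E-09 * (data_flits + nondata_flits) * 8.0;
}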
diff --git a/collectors/likwid/groups/broadwellEP/TLB_DATA.txt b/collectors/likwid/groups/broadwellEP/TLB_DATA.txt new file mode 100644 index 0000000..54f5e05 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/TLB_DATA.txt @@ -0,0 +1,35 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK +PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK +PMC2 DTLB_LOAD_MISSES_WALK_DURATION +PMC3 DTLB_STORE_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB load misses PMC0 +L1 DTLB load miss rate PMC0/FIXC0 +L1 DTLB load miss duration PMC2 +L1 DTLB store misses PMC1 +L1 DTLB store miss rate PMC1/FIXC0 +L1 DTLB store miss duration PMC3 + +LONG +Formulas: +L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB load miss duration = DTLB_LOAD_MISSES_WALK_DURATION +L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK +L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB store miss duration = DTLB_STORE_MISSES_WALK_DURATION +- +The DTLB load and store miss rates give a measure of how often a TLB miss occurred +per instruction. The duration metrics give the total number of cycles spent in page table walks. + diff --git a/collectors/likwid/groups/broadwellEP/TLB_INSTR.txt b/collectors/likwid/groups/broadwellEP/TLB_INSTR.txt new file mode 100644 index 0000000..647748f --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/TLB_INSTR.txt @@ -0,0 +1,28 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ITLB_MISSES_CAUSES_A_WALK +PMC1 ITLB_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration PMC1 + + +LONG +Formulas: +L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK +L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 ITLB miss duration = ITLB_MISSES_WALK_DURATION +- +The ITLB miss rate gives a measure of how often a TLB miss occurred +per instruction. The duration metric gives the total number of cycles spent in page table walks.
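Since the WALK_DURATION events accumulate cycles over all page table walks, the average cost of a single TLB miss follows by dividing by the walk count. A one-function sketch:

/* TLB metrics: WALK_DURATION accumulates cycles over all walks, so the
 * average cost of one TLB miss is total duration / number of walks. */
static double avg_walk_cycles(double walk_duration_cycles, double walk_count)
{
    return walk_duration_cycles / walk_count;
}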
+ diff --git a/collectors/likwid/groups/broadwellEP/TMA.txt b/collectors/likwid/groups/broadwellEP/TMA.txt new file mode 100644 index 0000000..afb4126 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/TMA.txt @@ -0,0 +1,48 @@ +SHORT Top down cycle allocation + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_RETIRED_RETIRE_SLOTS +PMC2 IDQ_UOPS_NOT_DELIVERED_CORE +PMC3 INT_MISC_RECOVERY_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +IPC FIXC0/FIXC1 +Total Slots 4*FIXC1 +Slots Retired PMC1 +Fetch Bubbles PMC2 +Recovery Bubbles 4*PMC3 +Front End [%] PMC2/(4*FIXC1)*100 +Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100 +Retiring [%] PMC1/(4*FIXC1)*100 +Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100 + +LONG +Formulas: +Total Slots = 4*CPU_CLK_UNHALTED_CORE +Slots Retired = UOPS_RETIRED_RETIRE_SLOTS +Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE +Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES +Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100 +Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100 +Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100 +Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100 +-- +This performance group measures cycles to determine the percentage of time spent in +the front end, back end, retiring and speculation. These metrics are published and +verified by Intel. Further information: +Webpage describing Top-Down Method and its usage in Intel vTune: +https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method +Paper by Ahmad Yasin: +https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0 +Slides by Ahmad Yasin: +http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf +The performance group was originally published here: +http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/ diff --git a/collectors/likwid/groups/broadwellEP/UOPS.txt b/collectors/likwid/groups/broadwellEP/UOPS.txt new file mode 100644 index 0000000..e6cc208 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/UOPS.txt @@ -0,0 +1,35 @@ +SHORT UOPs execution info + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_EXECUTED_THREAD +PMC2 UOPS_RETIRED_ALL +PMC3 UOPS_ISSUED_FLAGS_MERGE + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Issued UOPs PMC0 +Merged UOPs PMC3 +Executed UOPs PMC1 +Retired UOPs PMC2 + +LONG
Formulas: +Issued UOPs = UOPS_ISSUED_ANY +Merged UOPs = UOPS_ISSUED_FLAGS_MERGE +Executed UOPs = UOPS_EXECUTED_THREAD +Retired UOPs = UOPS_RETIRED_ALL +- +This group returns information about the instruction pipeline. It measures the +issued, executed and retired uOPs and returns the number of uOPs which were issued +but not executed as well as the number of uOPs which were executed but never retired. +The executed but not retired uOPs commonly come from speculatively executed branches.
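The TMA arithmetic is easier to follow written out: every core cycle provides four issue slots, and the four top-level categories are fractions of those slots that sum to 100%. A C sketch of the group's formulas (names are illustrative):

/* Top-Down level 1: 4 issue slots per core cycle are divided among
 * front end, bad speculation, retiring and back end; shares sum to 100%. */
static void tma_level1(double core_cycles, double uops_issued,
                       double uops_retired_slots, double fetch_bubbles,
                       double recovery_cycles)
{
    double slots       = 4.0 * core_cycles;
    double front_end   = fetch_bubbles / slots * 100.0;
    double speculation = (uops_issued - uops_retired_slots
                          + 4.0 * recovery_cycles) / slots * 100.0;
    double retiring    = uops_retired_slots / slots * 100.0;
    /* Equivalent to the group's Back End formula, since retired slots
     * cancel out when summing the other three categories. */
    double back_end    = 100.0 - front_end - speculation - retiring;
    (void)back_end;
}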
+ diff --git a/collectors/likwid/groups/core2/BRANCH.txt b/collectors/likwid/groups/core2/BRANCH.txt new file mode 100644 index 0000000..3c66c00 --- /dev/null +++ b/collectors/likwid/groups/core2/BRANCH.txt @@ -0,0 +1,30 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ANY +PMC1 BR_INST_RETIRED_MISPRED + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ANY/INSTR_RETIRED_ANY +Branch misprediction rate = BR_INST_RETIRED_MISPRED/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_INST_RETIRED_MISPRED/BR_INST_RETIRED_ANY +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ANY +- +The rates state how often, on average, a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio directly states +what fraction of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate. diff --git a/collectors/likwid/groups/core2/CACHE.txt b/collectors/likwid/groups/core2/CACHE.txt new file mode 100644 index 0000000..6eda059 --- /dev/null +++ b/collectors/likwid/groups/core2/CACHE.txt @@ -0,0 +1,34 @@ +SHORT Data cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPL +PMC1 L1D_ALL_REF + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +data cache misses PMC0 +data cache request rate PMC1/FIXC0 +data cache miss rate PMC0/FIXC0 +data cache miss ratio PMC0/PMC1 + +LONG +Formulas: +data cache request rate = L1D_ALL_REF / INSTR_RETIRED_ANY +data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY +data cache miss ratio = L1D_REPL / L1D_ALL_REF +- +This group measures the locality of your data accesses with regard to the +L1 cache. Data cache request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The data cache miss rate gives a measure how often it was necessary to get +cache lines from higher levels of the memory hierarchy. And finally +data cache miss ratio tells you how many of your memory references required +a cache line to be loaded from a higher level. While the data cache miss rate +might be given by your algorithm you should try to get the data cache miss ratio +as low as possible by increasing your cache reuse. + diff --git a/collectors/likwid/groups/core2/CLOCK.txt b/collectors/likwid/groups/core2/CLOCK.txt new file mode 100644 index 0000000..871c4f9 --- /dev/null +++ b/collectors/likwid/groups/core2/CLOCK.txt @@ -0,0 +1,19 @@ +SHORT CPU clock information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 + +LONG +Formulas: +CPI = CPU_CLK_UNHALTED_CORE / INSTR_RETIRED_ANY +- +Most basic performance group measuring the clock frequency of the machine.
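The derived Clock [MHz] metric scales the nominal frequency (the inverse of LIKWID's inverseClock value, in Hz) by the ratio of unhalted core cycles to unhalted reference cycles, giving the average frequency the core actually ran at. A one-function sketch:

/* Derived clock: nominal frequency (1/inverseClock, in Hz) scaled by the
 * core/reference cycle ratio, converted to MHz. */
static double clock_mhz(double clk_unhalted_core, double clk_unhalted_ref,
                        double inverse_clock)
{
    return 1.0E-06 * (clk_unhalted_core / clk_unhalted_ref) / inverse_clock;
}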
+ diff --git a/collectors/likwid/groups/core2/DATA.txt b/collectors/likwid/groups/core2/DATA.txt new file mode 100644 index 0000000..0f5bca5 --- /dev/null +++ b/collectors/likwid/groups/core2/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INST_RETIRED_LOADS +PMC1 INST_RETIRED_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = INST_RETIRED_LOADS/INST_RETIRED_STORES +- +This is a simple metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/core2/DIVIDE.txt b/collectors/likwid/groups/core2/DIVIDE.txt new file mode 100644 index 0000000..0753b4e --- /dev/null +++ b/collectors/likwid/groups/core2/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLES_DIV_BUSY +PMC1 DIV + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC1 +Avg. divide unit usage duration PMC0/PMC1 + +LONG +Formulas: +Number of divide ops = DIV +Avg. divide unit usage duration = CYCLES_DIV_BUSY/DIV +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/core2/FLOPS_DP.txt b/collectors/likwid/groups/core2/FLOPS_DP.txt new file mode 100644 index 0000000..e1698ff --- /dev/null +++ b/collectors/likwid/groups/core2/FLOPS_DP.txt @@ -0,0 +1,29 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 SIMD_COMP_INST_RETIRED_PACKED_DOUBLE +PMC1 SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*PMC0/(PMC0+PMC1) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_DOUBLE*2+SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE)/time +Packed [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_PACKED_DOUBLE/runtime +Scalar [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*SIMD_COMP_INST_RETIRED_PACKED_DOUBLE/(SIMD_COMP_INST_RETIRED_PACKED_DOUBLE+SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE) +- +Profiling group to measure double precision SSE FLOPs. Don't forget that your code might also execute X87 FLOPs. +From the number of SIMD_COMP_INST_RETIRED_PACKED_DOUBLE you can see how well your code was vectorized.
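The DIVIDE groups derive an average latency from two views of the same activity: the number of divide operations and the total cycles the divider was busy. A one-function sketch:

/* DIVIDE: total busy cycles divided by the number of divide operations
 * gives the average number of cycles the divider was occupied per op. */
static double avg_divide_cycles(double div_busy_cycles, double div_count)
{
    return div_busy_cycles / div_count;
}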
+ + diff --git a/collectors/likwid/groups/core2/FLOPS_SP.txt b/collectors/likwid/groups/core2/FLOPS_SP.txt new file mode 100644 index 0000000..a2c842c --- /dev/null +++ b/collectors/likwid/groups/core2/FLOPS_SP.txt @@ -0,0 +1,29 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 SIMD_COMP_INST_RETIRED_PACKED_SINGLE +PMC1 SIMD_COMP_INST_RETIRED_SCALAR_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*PMC0/(PMC0+PMC1) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_SINGLE*4+SIMD_COMP_INST_RETIRED_SCALAR_SINGLE)/time +Packed [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_PACKED_SINGLE/runtime +Scalar [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_SCALAR_SINGLE/runtime +Vectorization ratio [%] = 100*SIMD_COMP_INST_RETIRED_PACKED_SINGLE/(SIMD_COMP_INST_RETIRED_PACKED_SINGLE+SIMD_COMP_INST_RETIRED_SCALAR_SINGLE) +- +Profiling group to measure single precision SSE FLOPs. Don't forget that your code might also execute X87 FLOPs. +From the number of SIMD_COMP_INST_RETIRED_PACKED_SINGLE you can see how well your code was vectorized. + + diff --git a/collectors/likwid/groups/core2/FLOPS_X87.txt b/collectors/likwid/groups/core2/FLOPS_X87.txt new file mode 100644 index 0000000..46309e4 --- /dev/null +++ b/collectors/likwid/groups/core2/FLOPS_X87.txt @@ -0,0 +1,21 @@ +SHORT X87 MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 X87_OPS_RETIRED_ANY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +X87 [MFLOP/s] 1.0E-06*PMC0/time + +LONG +Formulas: +X87 [MFLOP/s] = 1.0E-06*X87_OPS_RETIRED_ANY/time +- +Profiling group to measure X87 FLOPs. Note that non-computational operations +are also counted by this event. + diff --git a/collectors/likwid/groups/core2/L2.txt b/collectors/likwid/groups/core2/L2.txt new file mode 100644 index 0000000..d8cbe0d --- /dev/null +++ b/collectors/likwid/groups/core2/L2.txt @@ -0,0 +1,35 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPL +PMC1 L1D_M_EVICT + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT)*64/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT)*64.0 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is +computed by the number of cache lines allocated in the L1 and the +number of modified cache lines evicted from the L1. +Note that this bandwidth also includes data transfers due to a +write allocate load on a store miss in L1.
+ diff --git a/collectors/likwid/groups/core2/L2CACHE.txt b/collectors/likwid/groups/core2/L2CACHE.txt new file mode 100644 index 0000000..d3b8776 --- /dev/null +++ b/collectors/likwid/groups/core2/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_RQSTS_THIS_CORE_ALL_MESI +PMC1 L2_RQSTS_SELF_I_STATE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_RQSTS_THIS_CORE_ALL_MESI / INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_SELF_I_STATE / INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_SELF_I_STATE / L2_RQSTS_THIS_CORE_ALL_MESI +- +This group measures the locality of your data accesses with regard to the +L2 cache. The L2 request rate tells you how data intensive your code is, +i.e. how many data accesses it performs on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally, the L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/core2/MEM.txt b/collectors/likwid/groups/core2/MEM.txt new file mode 100644 index 0000000..f6522ba --- /dev/null +++ b/collectors/likwid/groups/core2/MEM.txt @@ -0,0 +1,23 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BUS_TRANS_MEM_THIS_CORE_THIS_A +PMC1 BUS_TRANS_WB_THIS_CORE_ALL_A + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +Memory bandwidth [MBytes/s] = 1.0E-06*(BUS_TRANS_MEM_THIS_CORE_THIS_A+BUS_TRANS_WB_THIS_CORE_ALL_A)*64.0/time +Memory data volume [GBytes] = 1.0E-09*(BUS_TRANS_MEM_THIS_CORE_THIS_A+BUS_TRANS_WB_THIS_CORE_ALL_A)*64.0 +- +Profiling group to measure the main memory bandwidth drawn by this core. diff --git a/collectors/likwid/groups/core2/TLB.txt b/collectors/likwid/groups/core2/TLB.txt new file mode 100644 index 0000000..a46cc4b --- /dev/null +++ b/collectors/likwid/groups/core2/TLB.txt @@ -0,0 +1,29 @@ +SHORT TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_MISSES_ANY +PMC1 L1D_ALL_REF + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +L1 DTLB request rate PMC1/FIXC0 +DTLB miss rate PMC0/FIXC0 +L1 DTLB miss ratio PMC0/PMC1 + +LONG +Formulas: +L1 DTLB request rate = L1D_ALL_REF / INSTR_RETIRED_ANY +DTLB miss rate = DTLB_MISSES_ANY / INSTR_RETIRED_ANY +L1 DTLB miss ratio = DTLB_MISSES_ANY / L1D_ALL_REF +- +The L1 DTLB request rate tells you how data intensive your code is, +i.e. how many data accesses it performs on average per instruction. +The DTLB miss rate gives a measure of how often a TLB miss occurred +per instruction. And finally, the L1 DTLB miss ratio tells you how many +of your memory references caused a TLB miss on average.
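The miss rate/ratio groups use two different normalizations that are easy to conflate: a *rate* divides by retired instructions, a *ratio* by requests to the same cache level. A small C sketch of the distinction, with illustrative counter values:

#include <stdio.h>

/* Rate vs. ratio as used by the cache and TLB groups above: rates are
 * per retired instruction, ratios are per request to the same level. */
int main(void)
{
    double requests = 4.0e8, misses = 2.0e7, instr = 6.0e9;
    printf("L2 request rate = %.4f\n", requests / instr);
    printf("L2 miss rate    = %.4f\n", misses / instr);
    printf("L2 miss ratio   = %.4f\n", misses / requests);
    return 0;
}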
+ diff --git a/collectors/likwid/groups/core2/UOPS.txt b/collectors/likwid/groups/core2/UOPS.txt new file mode 100644 index 0000000..5d816d8 --- /dev/null +++ b/collectors/likwid/groups/core2/UOPS.txt @@ -0,0 +1,26 @@ +SHORT UOPs execution info + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 RS_UOPS_DISPATCHED_ALL +PMC1 UOPS_RETIRED_ANY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Executed UOPs PMC0 +Retired UOPs PMC1 + +LONG +Formulas: +Executed UOPs = RS_UOPS_DISPATCHED_ALL +Retired UOPs = UOPS_RETIRED_ANY +- +This performance group measures the executed and retired micro-ops. The difference +between executed and retired uOPs is the number of speculatively executed uOPs. diff --git a/collectors/likwid/groups/core2/UOPS_RETIRE.txt b/collectors/likwid/groups/core2/UOPS_RETIRE.txt new file mode 100644 index 0000000..be0bf73 --- /dev/null +++ b/collectors/likwid/groups/core2/UOPS_RETIRE.txt @@ -0,0 +1,25 @@ +SHORT UOPs retirement + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_RETIRED_USED_CYCLES +PMC1 UOPS_RETIRED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio PMC0/FIXC1 +Unused cycles ratio PMC1/FIXC1 + + +LONG +Formulas: +Used cycles ratio = UOPS_RETIRED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio = UOPS_RETIRED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +- +This performance group returns the ratios of used and unused CPU cycles. Here, +unused cycles are cycles in which no operation is performed due to a stall. diff --git a/collectors/likwid/groups/goldmont/BRANCH.txt b/collectors/likwid/groups/goldmont/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/goldmont/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often, on average, a branch or a mispredicted branch occurred +per retired instruction. The branch misprediction ratio directly relates the +mispredicted branches to all branch instructions, i.e. it states what fraction of all branch instructions was mispredicted. +Instructions per branch is 1/branch rate.
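The four branch metrics derive from just two branch counters plus INSTR_RETIRED_ANY; in particular, instructions per branch is the reciprocal of the branch rate. A sketch with illustrative values:

#include <stdio.h>

/* Branch metrics per the BRANCH.txt formulas. br, misp and instr are
 * assumed readings of BR_INST_RETIRED_ALL_BRANCHES,
 * BR_MISP_RETIRED_ALL_BRANCHES and INSTR_RETIRED_ANY. */
int main(void)
{
    double br = 8.0e8, misp = 1.6e7, instr = 6.0e9;
    printf("Branch rate                = %.4f\n", br / instr);
    printf("Branch misprediction rate  = %.4f\n", misp / instr);
    printf("Branch misprediction ratio = %.4f\n", misp / br);
    printf("Instructions per branch    = %.1f\n", instr / br); /* = 1/branch rate */
    return 0;
}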
+ diff --git a/collectors/likwid/groups/goldmont/CLOCK.txt b/collectors/likwid/groups/goldmont/CLOCK.txt new file mode 100644 index 0000000..b2174c8 --- /dev/null +++ b/collectors/likwid/groups/goldmont/CLOCK.txt @@ -0,0 +1,23 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +- +Goldmont implements the new RAPL interface. This interface makes it possible to +monitor the consumed energy on the package (socket) level. + diff --git a/collectors/likwid/groups/goldmont/DATA.txt b/collectors/likwid/groups/goldmont/DATA.txt new file mode 100644 index 0000000..61a915b --- /dev/null +++ b/collectors/likwid/groups/goldmont/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_UOPS_RETIRED_ALL_LOADS +PMC1 MEM_UOPS_RETIRED_ALL_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_UOPS_RETIRED_ALL_LOADS/MEM_UOPS_RETIRED_ALL_STORES +- +This is a metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/goldmont/DIVIDE.txt b/collectors/likwid/groups/goldmont/DIVIDE.txt new file mode 100644 index 0000000..9fc6702 --- /dev/null +++ b/collectors/likwid/groups/goldmont/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLES_DIV_BUSY_ALL +PMC1 CYCLES_DIV_BUSY_ALL_COUNT + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC1 +Avg. divide unit usage duration PMC0/PMC1 + +LONG +Formulas: +Number of divide ops = CYCLES_DIV_BUSY_ALL_COUNT +Avg. divide unit usage duration = CYCLES_DIV_BUSY_ALL/CYCLES_DIV_BUSY_ALL_COUNT +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/goldmont/ENERGY.txt b/collectors/likwid/groups/goldmont/ENERGY.txt new file mode 100644 index 0000000..7770534 --- /dev/null +++ b/collectors/likwid/groups/goldmont/ENERGY.txt @@ -0,0 +1,33 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR3 PWR_DRAM_ENERGY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +Goldmont implements the new RAPL interface. This interface makes it possible to +monitor the consumed energy on the package (socket) level.
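The RAPL-based groups report energy directly and derive power as energy over runtime. A minimal sketch, assuming the PWR counter readings (in Joules) are available; the numbers are illustrative:

#include <stdio.h>

/* RAPL derivation per the ENERGY.txt formulas: the PWR counters deliver
 * Joules accumulated over the measurement interval; dividing by the
 * runtime yields average power. */
int main(void)
{
    double pkg_energy = 42.5;  /* PWR_PKG_ENERGY [J] over the interval */
    double dram_energy = 7.3;  /* PWR_DRAM_ENERGY [J] */
    double time = 1.5;         /* runtime [s] */
    printf("Power [W]      = %.2f\n", pkg_energy / time);
    printf("Power DRAM [W] = %.2f\n", dram_energy / time);
    return 0;
}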
+ diff --git a/collectors/likwid/groups/goldmont/ICACHE.txt b/collectors/likwid/groups/goldmont/ICACHE.txt new file mode 100644 index 0000000..5f11ad6 --- /dev/null +++ b/collectors/likwid/groups/goldmont/ICACHE.txt @@ -0,0 +1,25 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ICACHE_ACCESSES +PMC1 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 + +LONG +Formulas: +L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY +L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES +- +This group measures some L1 instruction cache metrics. diff --git a/collectors/likwid/groups/goldmont/L2CACHE.txt b/collectors/likwid/groups/goldmont/L2CACHE.txt new file mode 100644 index 0000000..32a1545 --- /dev/null +++ b/collectors/likwid/groups/goldmont/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 LONGEST_LAT_CACHE_REFERENCE +PMC1 LONGEST_LAT_CACHE_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = LONGEST_LAT_CACHE_REFERENCE/INSTR_RETIRED_ANY +L2 miss rate = LONGEST_LAT_CACHE_MISS/INSTR_RETIRED_ANY +L2 miss ratio = LONGEST_LAT_CACHE_MISS/LONGEST_LAT_CACHE_REFERENCE +- +This group measures the locality of your data accesses with regard to the +L2 cache. The L2 request rate tells you how data intensive your code is, +i.e. how many data accesses it performs on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally, the L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache +reuse. + diff --git a/collectors/likwid/groups/goldmont/TLB_DATA.txt b/collectors/likwid/groups/goldmont/TLB_DATA.txt new file mode 100644 index 0000000..b4679e5 --- /dev/null +++ b/collectors/likwid/groups/goldmont/TLB_DATA.txt @@ -0,0 +1,27 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 PAGE_WALKS_D_SIDE_COUNT +PMC1 PAGE_WALKS_D_SIDE_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB misses PMC0 +L1 DTLB miss rate PMC0/FIXC0 +L1 DTLB miss duration [Cyc] PMC1/PMC0 + +LONG +Formulas: +L1 DTLB misses = PAGE_WALKS_D_SIDE_COUNT +L1 DTLB miss rate = PAGE_WALKS_D_SIDE_COUNT / INSTR_RETIRED_ANY +L1 DTLB miss duration [Cyc] = PAGE_WALKS_D_SIDE_CYCLES / PAGE_WALKS_D_SIDE_COUNT +- +The DTLB miss rate gives a measure of how often a TLB miss occurred +per instruction. The duration measures how many cycles a page table walk took on average.
+ diff --git a/collectors/likwid/groups/goldmont/TLB_INSTR.txt b/collectors/likwid/groups/goldmont/TLB_INSTR.txt new file mode 100644 index 0000000..30dce1e --- /dev/null +++ b/collectors/likwid/groups/goldmont/TLB_INSTR.txt @@ -0,0 +1,27 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 PAGE_WALKS_I_SIDE_COUNT +PMC1 PAGE_WALKS_I_SIDE_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + + +LONG +Formulas: +L1 ITLB misses = PAGE_WALKS_I_SIDE_COUNT +L1 ITLB miss rate = PAGE_WALKS_I_SIDE_COUNT / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = PAGE_WALKS_I_SIDE_CYCLES / PAGE_WALKS_I_SIDE_COUNT +- +The ITLB miss rate gives a measure of how often a TLB miss occurred +per instruction. The duration measures how many cycles a page table walk took on average. diff --git a/collectors/likwid/groups/haswell/BRANCH.txt b/collectors/likwid/groups/haswell/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/haswell/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often, on average, a branch or a mispredicted branch occurred +per retired instruction. The branch misprediction ratio directly relates the +mispredicted branches to all branch instructions, i.e. it states what fraction of all branch instructions was mispredicted. +Instructions per branch is 1/branch rate.
+ diff --git a/collectors/likwid/groups/haswell/CACHES.txt b/collectors/likwid/groups/haswell/CACHES.txt new file mode 100644 index 0000000..e39e861 --- /dev/null +++ b/collectors/likwid/groups/haswell/CACHES.txt @@ -0,0 +1,71 @@ +SHORT Cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L1D_M_EVICT +PMC2 L2_LINES_IN_ALL +PMC3 L2_TRANS_L2_WB +CBOX0C0 CACHE_LOOKUP_READ_MESI +CBOX1C0 CACHE_LOOKUP_READ_MESI +CBOX2C0 CACHE_LOOKUP_READ_MESI +CBOX3C0 CACHE_LOOKUP_READ_MESI +CBOX0C1 CACHE_LOOKUP_WRITE_MESI +CBOX1C1 CACHE_LOOKUP_WRITE_MESI +CBOX2C1 CACHE_LOOKUP_WRITE_MESI +CBOX3C1 CACHE_LOOKUP_WRITE_MESI + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 +L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time +L3 to L2 load data volume [GBytes] 1.0E-09*PMC2*64.0 +L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time +L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 +System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0)*64.0/time +System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0)*64.0 +L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1)*64/time +L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1)*64 +L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1)*64.0/time +L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1)*64.0 + +LONG +Formulas: +L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time +L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64 +L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time +L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64 +L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time +L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64 +L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time +L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64 +L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time +L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64 +L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(CACHE_LOOKUP_READ_MESI))*64/time +System to L3 data volume [GBytes] = 1.0E-09*(SUM(CACHE_LOOKUP_READ_MESI))*64 +L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(CACHE_LOOKUP_WRITE_MESI))*64/time +L3 to system data volume [GBytes] = 1.0E-09*(SUM(CACHE_LOOKUP_WRITE_MESI))*64 +L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(CACHE_LOOKUP_READ_MESI)+SUM(CACHE_LOOKUP_WRITE_MESI))*64/time +L3 to/from system data volume [GBytes] = 
1.0E-09*(SUM(CACHE_LOOKUP_READ_MESI)+SUM(CACHE_LOOKUP_WRITE_MESI))*64 +- +Group to measure cache transfers between L1 and memory. Please note that the +L3 to/from system metrics contain any traffic to the system (memory, +Intel QPI, etc.) but do not seem to capture all of it, because the memory read +bandwidth and the L3 to L2 bandwidth are commonly higher than the memory to L3 bandwidth. + diff --git a/collectors/likwid/groups/haswell/CLOCK.txt b/collectors/likwid/groups/haswell/CLOCK.txt new file mode 100644 index 0000000..8055d5b --- /dev/null +++ b/collectors/likwid/groups/haswell/CLOCK.txt @@ -0,0 +1,26 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +UBOXFIX UNCORE_CLOCK + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*UBOXFIX/time +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time +- +Haswell implements the new RAPL interface. This interface makes it possible to +monitor the consumed energy on the package (socket) level. + diff --git a/collectors/likwid/groups/haswell/CYCLE_ACTIVITY.txt b/collectors/likwid/groups/haswell/CYCLE_ACTIVITY.txt new file mode 100644 index 0000000..c432a44 --- /dev/null +++ b/collectors/likwid/groups/haswell/CYCLE_ACTIVITY.txt @@ -0,0 +1,38 @@ +SHORT Cycle Activities + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING +PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING +PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING +PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Cycles without execution [%] (PMC3/FIXC1)*100 +Cycles without execution due to L1D [%] (PMC2/FIXC1)*100 +Cycles without execution due to L2 [%] (PMC0/FIXC1)*100 +Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100 +-- +This performance group measures the cycles while waiting for data from the cache +and memory hierarchy. +CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts number of cycles nothing is executed on +any execution port. +CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while memory subsystem has an +outstanding load.
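Each CYCLE_ACTIVITY metric is simply the corresponding pending/no-execute cycle count expressed as a share of all unhalted core cycles. A sketch with illustrative counter values:

#include <stdio.h>

/* CYCLE_ACTIVITY shares per the formulas above: each pending/no-execute
 * cycle count as a percentage of CPU_CLK_UNHALTED_CORE. */
int main(void)
{
    double core_cycles = 9.0e9;
    double no_execute = 2.7e9, l1d_pending = 1.1e9,
           l2_pending = 6.0e8, ldm_pending = 1.8e9;
    printf("Cycles without execution [%%]       = %.1f\n", 100.0 * no_execute / core_cycles);
    printf("... due to L1D [%%]                  = %.1f\n", 100.0 * l1d_pending / core_cycles);
    printf("... due to L2 [%%]                   = %.1f\n", 100.0 * l2_pending / core_cycles);
    printf("... due to memory loads [%%]         = %.1f\n", 100.0 * ldm_pending / core_cycles);
    return 0;
}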
diff --git a/collectors/likwid/groups/haswell/CYCLE_STALLS.txt b/collectors/likwid/groups/haswell/CYCLE_STALLS.txt new file mode 100644 index 0000000..795aeb9 --- /dev/null +++ b/collectors/likwid/groups/haswell/CYCLE_STALLS.txt @@ -0,0 +1,45 @@ +SHORT Cycle Activities (Stalls) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING +PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING +PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING +PMC3 CYCLE_ACTIVITY_STALLS_TOTAL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Total execution stalls PMC3 +Stalls caused by L1D misses [%] (PMC2/PMC3)*100 +Stalls caused by L2 misses [%] (PMC0/PMC3)*100 +Stalls caused by memory loads [%] (PMC1/PMC3)*100 +Execution stall rate [%] (PMC3/FIXC1)*100 +Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100 +Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100 +Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL +Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100 +-- +This performance group measures the stalls caused by data traffic in the cache +hierarchy. +CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls. +CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while memory subsystem has +an outstanding load. diff --git a/collectors/likwid/groups/haswell/DATA.txt b/collectors/likwid/groups/haswell/DATA.txt new file mode 100644 index 0000000..17948d4 --- /dev/null +++ b/collectors/likwid/groups/haswell/DATA.txt @@ -0,0 +1,27 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_UOPS_RETIRED_LOADS +PMC1 MEM_UOPS_RETIRED_STORES +PMC2 UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 +Load ratio PMC0/PMC2 +Store ratio PMC1/PMC2 + +LONG +Formulas: +Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES +Load ratio = MEM_UOPS_RETIRED_LOADS/UOPS_RETIRED_ALL +Store ratio = MEM_UOPS_RETIRED_STORES/UOPS_RETIRED_ALL +- +This is a metric to determine your load to store ratio. 
+ diff --git a/collectors/likwid/groups/haswell/DIVIDE.txt b/collectors/likwid/groups/haswell/DIVIDE.txt new file mode 100644 index 0000000..c9690cf --- /dev/null +++ b/collectors/likwid/groups/haswell/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ARITH_DIVIDER_UOPS +PMC1 ARITH_DIVIDER_CYCLES + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_DIVIDER_UOPS +Avg. divide unit usage duration = ARITH_DIVIDER_CYCLES/ARITH_DIVIDER_UOPS +- +This performance group measures the average latency of divide operations diff --git a/collectors/likwid/groups/haswell/ENERGY.txt b/collectors/likwid/groups/haswell/ENERGY.txt new file mode 100644 index 0000000..59242db --- /dev/null +++ b/collectors/likwid/groups/haswell/ENERGY.txt @@ -0,0 +1,39 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR2 PWR_PP1_ENERGY +PWR3 PWR_DRAM_ENERGY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy PP1 [J] PWR2 +Power PP1 [W] PWR2/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power PP1 = PWR_PP1_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +Haswell implements the new RAPL interface. This interface enables to +monitor the consumed energy on the package (socket) and DRAM level. + diff --git a/collectors/likwid/groups/haswell/FALSE_SHARE.txt b/collectors/likwid/groups/haswell/FALSE_SHARE.txt new file mode 100644 index 0000000..db438a3 --- /dev/null +++ b/collectors/likwid/groups/haswell/FALSE_SHARE.txt @@ -0,0 +1,27 @@ +SHORT False sharing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM +PMC2 MEM_LOAD_UOPS_RETIRED_ALL_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local LLC hit with false sharing [MByte] 1.E-06*PMC0*64 +Local LLC hit with false sharing rate PMC0/PMC2 + +LONG +Formulas: +Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM*64 +Local LLC false sharing rate = MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL +- +False-sharing of cache lines can dramatically reduce the performance of an +application. This performance group measures the L3 traffic induced by false-sharing. +The false-sharing rate uses all memory loads as reference. +Please keep in mind that the MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM event may +undercount by as much as 40% (Errata HSD25). 
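For context, the effect the FALSE_SHARE group measures is easy to provoke. The following illustrative-only C program (not part of the patch) makes two threads update different variables that share one 64-byte cache line, so the line ping-pongs between cores as HITM transfers; compile with -pthread:

#include <pthread.h>
#include <stdio.h>

/* Two counters in the same 64-byte cache line: updates from different
 * threads force the line to bounce between the cores' caches. */
struct { volatile long a; volatile long b; } shared;

static void *bump_a(void *arg) { (void)arg; for (long i = 0; i < 100000000L; i++) shared.a++; return NULL; }
static void *bump_b(void *arg) { (void)arg; for (long i = 0; i < 100000000L; i++) shared.b++; return NULL; }

int main(void)
{
    pthread_t t1, t2;
    pthread_create(&t1, NULL, bump_a, NULL);
    pthread_create(&t2, NULL, bump_b, NULL);
    pthread_join(t1, NULL);
    pthread_join(t2, NULL);
    printf("a=%ld b=%ld\n", shared.a, shared.b);
    return 0;
}

Running this under the FALSE_SHARE group should show a high false-sharing rate; padding the two fields onto separate cache lines (e.g. with _Alignas(64)) should make it drop.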
diff --git a/collectors/likwid/groups/haswell/FLOPS_AVX.txt b/collectors/likwid/groups/haswell/FLOPS_AVX.txt new file mode 100644 index 0000000..15aacb8 --- /dev/null +++ b/collectors/likwid/groups/haswell/FLOPS_AVX.txt @@ -0,0 +1,28 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 AVX_INSTS_CALC + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC0*4.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(AVX_INSTS_CALC*8)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(AVX_INSTS_CALC*4)/runtime +- +Packed 32b AVX FLOP/s rates. Approximate counts of AVX & AVX2 256-bit instructions. +May count non-AVX instructions that employ 256-bit operations, including (but +not necessarily limited to) rep string instructions that use 256-bit loads and +stores for optimized performance, XSAVE* and XRSTOR*, and operations that +transition the x87 FPU data registers between x87 and MMX. +Caution: The event AVX_INSTS_CALC counts the insertf128 instruction often used +by the Intel C compilers for (unaligned) vector loads. diff --git a/collectors/likwid/groups/haswell/ICACHE.txt b/collectors/likwid/groups/haswell/ICACHE.txt new file mode 100644 index 0000000..f1e2335 --- /dev/null +++ b/collectors/likwid/groups/haswell/ICACHE.txt @@ -0,0 +1,33 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ICACHE_ACCESSES +PMC1 ICACHE_MISSES +PMC2 ICACHE_IFETCH_STALL +PMC3 ILD_STALL_IQ_FULL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 +L1I stalls PMC2 +L1I stall rate PMC2/FIXC0 +L1I queue full stalls PMC3 +L1I queue full stall rate PMC3/FIXC0 + +LONG +Formulas: +L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY +L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES +L1I stalls = ICACHE_IFETCH_STALL +L1I stall rate = ICACHE_IFETCH_STALL / INSTR_RETIRED_ANY +- +This group measures some L1 instruction cache metrics. 
diff --git a/collectors/likwid/groups/haswell/L2.txt b/collectors/likwid/groups/haswell/L2.txt new file mode 100644 index 0000000..60c7f79 --- /dev/null +++ b/collectors/likwid/groups/haswell/L2.txt @@ -0,0 +1,37 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L2_TRANS_L1D_WB +PMC2 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the +number of cache lines loaded from the L2 into the L1 data cache and the writebacks from +the L1 data cache to the L2 cache. The group also outputs the total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and cache lines transferred into the instruction +cache. diff --git a/collectors/likwid/groups/haswell/L2CACHE.txt b/collectors/likwid/groups/haswell/L2CACHE.txt new file mode 100644 index 0000000..9b5dd4b --- /dev/null +++ b/collectors/likwid/groups/haswell/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_TRANS_ALL_REQUESTS +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS +- +This group measures the locality of your data accesses with regard to the +L2 cache. The L2 request rate tells you how data intensive your code is, +i.e. how many data accesses it performs on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally, the L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+ + diff --git a/collectors/likwid/groups/haswell/L3.txt b/collectors/likwid/groups/haswell/L3.txt new file mode 100644 index 0000000..f63a918 --- /dev/null +++ b/collectors/likwid/groups/haswell/L3.txt @@ -0,0 +1,36 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ALL +PMC1 L2_TRANS_L2_WB + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed from the +number of cache lines allocated in the L2 and the number of modified cache lines +evicted from the L2. This group also outputs the data volume transferred between the +L3 and the measured cores' L2 caches. Note that this bandwidth also includes data +transfers due to a write allocate load on a store miss in L2. + diff --git a/collectors/likwid/groups/haswell/L3CACHE.txt b/collectors/likwid/groups/haswell/L3CACHE.txt new file mode 100644 index 0000000..f863daa --- /dev/null +++ b/collectors/likwid/groups/haswell/L3CACHE.txt @@ -0,0 +1,35 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_RETIRED_L3_ALL +PMC1 MEM_LOAD_UOPS_RETIRED_L3_MISS +PMC2 UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate PMC0/PMC2 +L3 miss rate PMC1/PMC2 +L3 miss ratio PMC1/PMC0 + +LONG +Formulas: +L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL +L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL +L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL +- +This group measures the locality of your data accesses with regard to the +L3 cache. The L3 request rate tells you how data intensive your code is, +i.e. how many data accesses it performs on average per instruction. +The L3 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally, the L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+ + diff --git a/collectors/likwid/groups/haswell/MEM.txt b/collectors/likwid/groups/haswell/MEM.txt new file mode 100644 index 0000000..3a12df7 --- /dev/null +++ b/collectors/likwid/groups/haswell/MEM.txt @@ -0,0 +1,36 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +MBOX0C1 DRAM_READS +MBOX0C2 DRAM_WRITES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory load bandwidth [MBytes/s] 1.0E-06*MBOX0C1*64.0/time +Memory load data volume [GBytes] 1.0E-09*MBOX0C1*64.0 +Memory evict bandwidth [MBytes/s] 1.0E-06*MBOX0C2*64.0/time +Memory evict data volume [GBytes] 1.0E-09*MBOX0C2*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX0C2)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX0C2)*64.0 + +LONG +Formulas: +Memory load bandwidth [MBytes/s] = 1.0E-06*DRAM_READS*64.0/time +Memory load data volume [GBytes] = 1.0E-09*DRAM_READS*64.0 +Memory evict bandwidth [MBytes/s] = 1.0E-06*DRAM_WRITES*64.0/time +Memory evict data volume [GBytes] = 1.0E-09*DRAM_WRITES*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(DRAM_READS+DRAM_WRITES)*64.0/time +Memory data volume [GBytes] = 1.0E-09*(DRAM_READS+DRAM_WRITES)*64.0 +- +Profiling group to measure main memory bandwidth. The bandwidth is computed from +the number of DRAM read and write accesses, each transferring one 64-byte cache line. + diff --git a/collectors/likwid/groups/haswell/PORT_USAGE.txt b/collectors/likwid/groups/haswell/PORT_USAGE.txt new file mode 100644 index 0000000..eb74ffe --- /dev/null +++ b/collectors/likwid/groups/haswell/PORT_USAGE.txt @@ -0,0 +1,46 @@ +SHORT Execution port utilization + +REQUIRE_NOHT + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_PORT_PORT_0 +PMC1 UOPS_EXECUTED_PORT_PORT_1 +PMC2 UOPS_EXECUTED_PORT_PORT_2 +PMC3 UOPS_EXECUTED_PORT_PORT_3 +PMC4 UOPS_EXECUTED_PORT_PORT_4 +PMC5 UOPS_EXECUTED_PORT_PORT_5 +PMC6 UOPS_EXECUTED_PORT_PORT_6 +PMC7 UOPS_EXECUTED_PORT_PORT_7 + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Port0 usage ratio PMC0/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port1 usage ratio PMC1/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port2 usage ratio PMC2/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port3 usage ratio PMC3/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port4 usage ratio PMC4/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port5 usage ratio PMC5/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port6 usage ratio PMC6/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port7 usage ratio PMC7/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) + +LONG +Formulas: +Port0 usage ratio = UOPS_EXECUTED_PORT_PORT_0/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port1 usage ratio = UOPS_EXECUTED_PORT_PORT_1/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port2 usage ratio = UOPS_EXECUTED_PORT_PORT_2/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port3 usage ratio = UOPS_EXECUTED_PORT_PORT_3/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port4 usage ratio = UOPS_EXECUTED_PORT_PORT_4/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port5 usage ratio = UOPS_EXECUTED_PORT_PORT_5/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port6 usage ratio =
UOPS_EXECUTED_PORT_PORT_6/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port7 usage ratio = UOPS_EXECUTED_PORT_PORT_7/SUM(UOPS_EXECUTED_PORT_PORT_*) +- +This group measures the execution port utilization in a CPU core. The group can +only be measured when HyperThreading is disabled because only then each CPU core +can program eight counters. diff --git a/collectors/likwid/groups/haswell/RECOVERY.txt b/collectors/likwid/groups/haswell/RECOVERY.txt new file mode 100644 index 0000000..7928ee4 --- /dev/null +++ b/collectors/likwid/groups/haswell/RECOVERY.txt @@ -0,0 +1,22 @@ +SHORT Recovery duration + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INT_MISC_RECOVERY_CYCLES +PMC1 INT_MISC_RECOVERY_COUNT + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Avg. recovery duration PMC0/PMC1 + +LONG +Formulas: +Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT +- +This group measures the duration of recoveries after SSE exceptions, memory +disambiguation, etc. diff --git a/collectors/likwid/groups/haswell/TLB_DATA.txt b/collectors/likwid/groups/haswell/TLB_DATA.txt new file mode 100644 index 0000000..8d94e05 --- /dev/null +++ b/collectors/likwid/groups/haswell/TLB_DATA.txt @@ -0,0 +1,35 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK +PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK +PMC2 DTLB_LOAD_MISSES_WALK_DURATION +PMC3 DTLB_STORE_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB load misses PMC0 +L1 DTLB load miss rate PMC0/FIXC0 +L1 DTLB load miss duration [Cyc] PMC2/PMC0 +L1 DTLB store misses PMC1 +L1 DTLB store miss rate PMC1/FIXC0 +L1 DTLB store miss duration [Cyc] PMC3/PMC1 + +LONG +Formulas: +L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK +L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK +- +The DTLB load and store miss rates give a measure of how often a TLB miss occurred +per instruction. The duration measures how many cycles a page table walk took on average.
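The walk-duration metrics divide the cycles spent in page table walks by the number of walks. A sketch of the load-side metrics, with illustrative counter values:

#include <stdio.h>

/* Page-walk cost per the TLB_DATA formulas: WALK_DURATION counts cycles
 * spent walking the page tables, CAUSES_A_WALK counts the walks, so the
 * quotient is the average walk latency. */
int main(void)
{
    double load_walks = 2.0e6, load_walk_cycles = 6.0e7, instr = 6.0e9;
    printf("L1 DTLB load miss rate           = %.2e\n", load_walks / instr);
    printf("L1 DTLB load miss duration [Cyc] = %.1f\n", load_walk_cycles / load_walks);
    return 0;
}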
+ diff --git a/collectors/likwid/groups/haswell/TLB_INSTR.txt b/collectors/likwid/groups/haswell/TLB_INSTR.txt new file mode 100644 index 0000000..235d977 --- /dev/null +++ b/collectors/likwid/groups/haswell/TLB_INSTR.txt @@ -0,0 +1,28 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ITLB_MISSES_CAUSES_A_WALK +PMC1 ITLB_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + + +LONG +Formulas: +L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK +L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK +- +The ITLB miss rate gives a measure of how often a TLB miss occurred +per instruction. The duration measures how many cycles a page table walk took on average. + diff --git a/collectors/likwid/groups/haswell/TMA.txt b/collectors/likwid/groups/haswell/TMA.txt new file mode 100644 index 0000000..afb4126 --- /dev/null +++ b/collectors/likwid/groups/haswell/TMA.txt @@ -0,0 +1,48 @@ +SHORT Top down cycle allocation + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_RETIRED_RETIRE_SLOTS +PMC2 IDQ_UOPS_NOT_DELIVERED_CORE +PMC3 INT_MISC_RECOVERY_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +IPC FIXC0/FIXC1 +Total Slots 4*FIXC1 +Slots Retired PMC1 +Fetch Bubbles PMC2 +Recovery Bubbles 4*PMC3 +Front End [%] PMC2/(4*FIXC1)*100 +Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100 +Retiring [%] PMC1/(4*FIXC1)*100 +Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100 + +LONG +Formulas: +Total Slots = 4*CPU_CLK_UNHALTED_CORE +Slots Retired = UOPS_RETIRED_RETIRE_SLOTS +Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE +Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES +Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100 +Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100 +Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100 +Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100 +-- +This performance group measures cycles to determine the percentage of time spent in +the front end, the back end, retiring and speculation. These metrics are published and +verified by Intel.
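As a worked example of the TMA formulas: the machine is assumed 4 uOPs wide, so the slot count is four times the unhalted core cycles, and the four categories partition those slots. A minimal C sketch with illustrative counter values:

#include <stdio.h>

/* Top-down level-1 breakdown per the TMA.txt formulas above, assuming a
 * 4-wide machine. The four percentages sum to 100 by construction. */
int main(void)
{
    double cycles = 9.0e9, issued = 2.4e10, retired = 2.0e10,
           fetch_bubbles = 4.0e9, recovery_cycles = 2.0e8;
    double slots = 4.0 * cycles;
    double frontend = fetch_bubbles / slots * 100.0;
    double speculation = (issued - retired + 4.0 * recovery_cycles) / slots * 100.0;
    double retiring = retired / slots * 100.0;
    double backend = 100.0 - frontend - speculation - retiring;
    printf("Front End [%%]   = %.1f\n", frontend);
    printf("Speculation [%%] = %.1f\n", speculation);
    printf("Retiring [%%]    = %.1f\n", retiring);
    printf("Back End [%%]    = %.1f\n", backend);
    return 0;
}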
Further information: +Webpage describing Top-Down Method and its usage in Intel vTune: +https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method +Paper by Yasin Ahmad: +https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0 +Slides by Yasin Ahmad: +http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf +The performance group was originally published here: +http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/ diff --git a/collectors/likwid/groups/haswell/UOPS.txt b/collectors/likwid/groups/haswell/UOPS.txt new file mode 100644 index 0000000..e6cc208 --- /dev/null +++ b/collectors/likwid/groups/haswell/UOPS.txt @@ -0,0 +1,35 @@ +SHORT UOPs execution info + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_EXECUTED_THREAD +PMC2 UOPS_RETIRED_ALL +PMC3 UOPS_ISSUED_FLAGS_MERGE + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Issued UOPs PMC0 +Merged UOPs PMC3 +Executed UOPs PMC1 +Retired UOPs PMC2 + +LONG +Formulas: +Issued UOPs = UOPS_ISSUED_ANY +Merged UOPs = UOPS_ISSUED_FLAGS_MERGE +Executed UOPs = UOPS_EXECUTED_THREAD +Retired UOPs = UOPS_RETIRED_ALL +- +This group returns information about the instruction pipeline. It measures the +issued, executed and retired uOPs and returns the number of uOPs which were issued +but not executed as well as the number of uOPs which were executed but never retired. +The executed but not retired uOPs commonly come from speculatively executed branches. + diff --git a/collectors/likwid/groups/haswell/UOPS_EXEC.txt b/collectors/likwid/groups/haswell/UOPS_EXEC.txt new file mode 100644 index 0000000..7042df7 --- /dev/null +++ b/collectors/likwid/groups/haswell/UOPS_EXEC.txt @@ -0,0 +1,31 @@ +SHORT UOPs execution + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_USED_CYCLES +PMC1 UOPS_EXECUTED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the execution stage in the pipeline. Used cycles are all cycles where uOPs are +executed while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles. 
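The :EDGEDETECT modifier used by the UOPS_* stall groups turns a cycles event into a count of rising edges, i.e. the number of distinct stall episodes, so the average stall duration is a simple quotient. A sketch with illustrative values:

#include <stdio.h>

/* Average stall duration per the UOPS_EXEC group above: EDGEDETECT counts
 * the transitions into the stalled state (stall episodes), so cycles per
 * episode is stall cycles divided by edges. */
int main(void)
{
    double stall_cycles = 3.0e9;  /* UOPS_EXECUTED_STALL_CYCLES */
    double stall_edges  = 5.0e8;  /* same event with :EDGEDETECT */
    printf("Avg stall duration [cycles] = %.2f\n", stall_cycles / stall_edges);
    return 0;
}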
diff --git a/collectors/likwid/groups/haswell/UOPS_ISSUE.txt b/collectors/likwid/groups/haswell/UOPS_ISSUE.txt new file mode 100644 index 0000000..9aac923 --- /dev/null +++ b/collectors/likwid/groups/haswell/UOPS_ISSUE.txt @@ -0,0 +1,31 @@ +SHORT UOPs issuing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_USED_CYCLES +PMC1 UOPS_ISSUED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the issue stage in the pipeline. Used cycles are all cycles where uOPs are +issued while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles. diff --git a/collectors/likwid/groups/haswell/UOPS_RETIRE.txt b/collectors/likwid/groups/haswell/UOPS_RETIRE.txt new file mode 100644 index 0000000..0f37585 --- /dev/null +++ b/collectors/likwid/groups/haswell/UOPS_RETIRE.txt @@ -0,0 +1,31 @@ +SHORT UOPs retirement + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_RETIRED_USED_CYCLES +PMC1 UOPS_RETIRED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the retirement stage in the pipeline (re-order buffer). Used cycles are all +cycles where uOPs are retired while unused cycles refer to pipeline stalls. +Moreover, the group calculates the average stall duration in cycles.
diff --git a/collectors/likwid/groups/haswellEP/BRANCH.txt b/collectors/likwid/groups/haswellEP/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often, on average, a branch or a mispredicted branch occurred +per retired instruction. The branch misprediction ratio directly relates the +mispredicted branches to all branch instructions, i.e. it states what fraction of all branch instructions was mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/haswellEP/CACHES.txt b/collectors/likwid/groups/haswellEP/CACHES.txt new file mode 100644 index 0000000..295a139 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/CACHES.txt @@ -0,0 +1,123 @@ +SHORT Cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L1D_M_EVICT +PMC2 L2_LINES_IN_ALL +PMC3 L2_TRANS_L2_WB +CBOX0C0 LLC_LOOKUP_DATA_READ +CBOX1C0 LLC_LOOKUP_DATA_READ +CBOX2C0 LLC_LOOKUP_DATA_READ +CBOX3C0 LLC_LOOKUP_DATA_READ +CBOX4C0 LLC_LOOKUP_DATA_READ +CBOX5C0 LLC_LOOKUP_DATA_READ +CBOX6C0 LLC_LOOKUP_DATA_READ +CBOX7C0 LLC_LOOKUP_DATA_READ +CBOX8C0 LLC_LOOKUP_DATA_READ +CBOX9C0 LLC_LOOKUP_DATA_READ +CBOX10C0 LLC_LOOKUP_DATA_READ +CBOX11C0 LLC_LOOKUP_DATA_READ +CBOX12C0 LLC_LOOKUP_DATA_READ +CBOX13C0 LLC_LOOKUP_DATA_READ +CBOX14C0 LLC_LOOKUP_DATA_READ +CBOX15C0 LLC_LOOKUP_DATA_READ +CBOX16C0 LLC_LOOKUP_DATA_READ +CBOX17C0 LLC_LOOKUP_DATA_READ +CBOX0C1 LLC_VICTIMS_M +CBOX1C1 LLC_VICTIMS_M +CBOX2C1 LLC_VICTIMS_M +CBOX3C1 LLC_VICTIMS_M +CBOX4C1 LLC_VICTIMS_M +CBOX5C1 LLC_VICTIMS_M +CBOX6C1 LLC_VICTIMS_M +CBOX7C1 LLC_VICTIMS_M +CBOX8C1 LLC_VICTIMS_M +CBOX9C1 LLC_VICTIMS_M +CBOX10C1 LLC_VICTIMS_M +CBOX11C1 LLC_VICTIMS_M +CBOX12C1 LLC_VICTIMS_M +CBOX13C1 LLC_VICTIMS_M +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 +L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time +L3
to L2 load data volume [GBytes] 1.0E-09*PMC2*64.0 +L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time +L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 +System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0)*64.0/time +System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0)*64.0 +L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64/time +L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64 +L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64.0/time +L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64.0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 + +LONG +Formulas: +L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time +L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64 +L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time +L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64 +L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time +L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64 +L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time +L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64 +L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time +L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64 +L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ))*64/time +System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ))*64 +L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M))*64/time +L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M))*64 +L3 to/from 
system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64/time +L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64 +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 +- +Group to measure cache transfers between L1 and Memory. Please notice that the +L3 to/from system metrics contain any traffic to the system (memory, +Intel QPI, etc.), but they do not seem to capture all of it: commonly the memory read +bandwidth and the L3 to L2 bandwidth are higher than the system to L3 bandwidth. + diff --git a/collectors/likwid/groups/haswellEP/CBOX.txt b/collectors/likwid/groups/haswellEP/CBOX.txt new file mode 100644 index 0000000..d9cc13c --- /dev/null +++ b/collectors/likwid/groups/haswellEP/CBOX.txt @@ -0,0 +1,61 @@ +SHORT CBOX related data and metrics + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +CBOX0C0 LLC_VICTIMS_M +CBOX1C0 LLC_VICTIMS_M +CBOX2C0 LLC_VICTIMS_M +CBOX3C0 LLC_VICTIMS_M +CBOX4C0 LLC_VICTIMS_M +CBOX5C0 LLC_VICTIMS_M +CBOX6C0 LLC_VICTIMS_M +CBOX7C0 LLC_VICTIMS_M +CBOX8C0 LLC_VICTIMS_M +CBOX9C0 LLC_VICTIMS_M +CBOX10C0 LLC_VICTIMS_M +CBOX11C0 LLC_VICTIMS_M +CBOX12C0 LLC_VICTIMS_M +CBOX13C0 LLC_VICTIMS_M +CBOX14C0 LLC_VICTIMS_M +CBOX15C0 LLC_VICTIMS_M +CBOX16C0 LLC_VICTIMS_M +CBOX17C0 LLC_VICTIMS_M +CBOX0C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX1C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX2C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX3C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX4C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX5C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX6C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX7C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX8C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX9C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX10C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX11C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX12C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX13C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX14C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX15C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX16C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX17C1:STATE=0x1 LLC_LOOKUP_ANY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +LLC misses per instruction (CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0)/FIXC0 +LLC data written to MEM [MBytes] 1E-6*(CBOX0C1:STATE=0x1+CBOX1C1:STATE=0x1+CBOX2C1:STATE=0x1+CBOX3C1:STATE=0x1+CBOX4C1:STATE=0x1+CBOX5C1:STATE=0x1+CBOX6C1:STATE=0x1+CBOX7C1:STATE=0x1+CBOX8C1:STATE=0x1+CBOX9C1:STATE=0x1+CBOX10C1:STATE=0x1+CBOX11C1:STATE=0x1+CBOX12C1:STATE=0x1+CBOX13C1:STATE=0x1+CBOX14C1:STATE=0x1+CBOX15C1:STATE=0x1+CBOX16C1:STATE=0x1+CBOX17C1:STATE=0x1)*64 + + +LONG +Formulas: +LLC Misses Per Instruction = sum(LLC_VICTIMS_M)/INSTR_RETIRED_ANY +LLC data written to MEM [MBytes] = sum(LLC_LOOKUP_ANY)*64*1E-6 +- +The CBOXes mediate the traffic from the L2 cache to the segmented L3 cache. Each +CBOX is responsible for one segment (2.5 MByte). The boxes maintain the coherence between all +CPU cores of the socket. Depending on the CPU core count, some CBOXes are not attached +to a 2.5 MByte slice but are still active and track the traffic. 
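As an editorial aside on the arithmetic used throughout these groups: the sketch below shows how the "LLC data written to MEM" metric above could be derived by hand from the raw per-CBOX counts (each event counts one 64-byte cache line). It is a minimal C illustration only; the counter values are hypothetical, and LIKWID performs this aggregation itself.

    #include <stdio.h>

    /* Sketch: sum per-CBOX LLC_LOOKUP_ANY:STATE=0x1 counts and scale to
     * MBytes, as in the CBOX group above. Counts are made-up examples. */
    int main(void)
    {
        double cbox[18] = {
            1.2e6, 1.1e6, 1.3e6, 1.0e6, 1.2e6, 1.1e6, 1.0e6, 1.3e6, 1.2e6,
            1.1e6, 1.0e6, 1.2e6, 1.3e6, 1.1e6, 1.0e6, 1.2e6, 1.1e6, 1.3e6 };
        double sum = 0.0;
        for (int i = 0; i < 18; i++)
            sum += cbox[i];
        /* each counted lookup corresponds to one 64-byte cache line */
        printf("LLC data written to MEM [MBytes]: %.2f\n", 1e-6 * sum * 64.0);
        return 0;
    }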
diff --git a/collectors/likwid/groups/haswellEP/CLOCK.txt b/collectors/likwid/groups/haswellEP/CLOCK.txt new file mode 100644 index 0000000..8055d5b --- /dev/null +++ b/collectors/likwid/groups/haswellEP/CLOCK.txt @@ -0,0 +1,26 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +UBOXFIX UNCORE_CLOCK + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*UBOXFIX/time +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time +- +Haswell implements the new RAPL interface. This interface enables monitoring of +the consumed energy at the package (socket) level. + diff --git a/collectors/likwid/groups/haswellEP/CYCLE_ACTIVITY.txt b/collectors/likwid/groups/haswellEP/CYCLE_ACTIVITY.txt new file mode 100644 index 0000000..c432a44 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/CYCLE_ACTIVITY.txt @@ -0,0 +1,38 @@ +SHORT Cycle Activities + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING +PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING +PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING +PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Cycles without execution [%] (PMC3/FIXC1)*100 +Cycles without execution due to L1D [%] (PMC2/FIXC1)*100 +Cycles without execution due to L2 [%] (PMC0/FIXC1)*100 +Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100 +Cycles with stalls due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles with stalls due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100 +-- +This performance group measures the cycles while waiting for data from the cache +and memory hierarchy. +CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts number of cycles nothing is executed on +any execution port. +CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while memory subsystem has an +outstanding load. 
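To make the CLOCK group's derived metrics concrete, here is a minimal C sketch of the Power and Uncore Clock formulas above. It assumes PWR_PKG_ENERGY is already scaled to Joules (as LIKWID reports it); all input values are hypothetical.

    #include <stdio.h>

    int main(void)
    {
        double time    = 2.0;    /* Runtime (RDTSC) [s], hypothetical */
        double pwr0    = 61.0;   /* PWR_PKG_ENERGY in Joules          */
        double uboxfix = 4.6e9;  /* UNCORE_CLOCK counts               */

        printf("Energy [J]        : %.1f\n", pwr0);
        printf("Power [W]         : %.1f\n", pwr0 / time);           /* 30.5 W   */
        printf("Uncore Clock [MHz]: %.0f\n", 1e-6 * uboxfix / time); /* 2300 MHz */
        return 0;
    }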
diff --git a/collectors/likwid/groups/haswellEP/CYCLE_STALLS.txt b/collectors/likwid/groups/haswellEP/CYCLE_STALLS.txt new file mode 100644 index 0000000..795aeb9 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/CYCLE_STALLS.txt @@ -0,0 +1,45 @@ +SHORT Cycle Activities (Stalls) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING +PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING +PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING +PMC3 CYCLE_ACTIVITY_STALLS_TOTAL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Total execution stalls PMC3 +Stalls caused by L1D misses [%] (PMC2/PMC3)*100 +Stalls caused by L2 misses [%] (PMC0/PMC3)*100 +Stalls caused by memory loads [%] (PMC1/PMC3)*100 +Execution stall rate [%] (PMC3/FIXC1)*100 +Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100 +Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100 +Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL +Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100 +-- +This performance group measures the stalls caused by data traffic in the cache +hierarchy. +CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls. +CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while memory subsystem has +an outstanding load. diff --git a/collectors/likwid/groups/haswellEP/DATA.txt b/collectors/likwid/groups/haswellEP/DATA.txt new file mode 100644 index 0000000..967cbad --- /dev/null +++ b/collectors/likwid/groups/haswellEP/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_UOPS_RETIRED_LOADS +PMC1 MEM_UOPS_RETIRED_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES +- +This is a metric to determine your load to store ratio. 
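The CYCLE_STALLS metrics above are plain ratios of the raw counts; a minimal C sketch with hypothetical counter values follows. Note that the three breakdown categories overlap (a memory-load stall may also be an L1D or L2 stall), so they need not sum to 100%.

    #include <stdio.h>

    int main(void)
    {
        double fixc1 = 1.0e9;  /* CPU_CLK_UNHALTED_CORE               */
        double pmc3  = 2.5e8;  /* CYCLE_ACTIVITY_STALLS_TOTAL         */
        double pmc2  = 1.0e8;  /* CYCLE_ACTIVITY_STALLS_L1D_PENDING   */
        double pmc0  = 6.0e7;  /* CYCLE_ACTIVITY_STALLS_L2_PENDING    */
        double pmc1  = 1.5e8;  /* CYCLE_ACTIVITY_STALLS_LDM_PENDING   */

        printf("Execution stall rate [%%]       : %.1f\n", pmc3 / fixc1 * 100.0);
        printf("Stalls caused by L1D misses [%%]: %.1f\n", pmc2 / pmc3 * 100.0);
        printf("Stalls caused by L2 misses [%%] : %.1f\n", pmc0 / pmc3 * 100.0);
        printf("Stalls caused by mem loads [%%] : %.1f\n", pmc1 / pmc3 * 100.0);
        return 0;
    }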
+ diff --git a/collectors/likwid/groups/haswellEP/DIVIDE.txt b/collectors/likwid/groups/haswellEP/DIVIDE.txt new file mode 100644 index 0000000..c9690cf --- /dev/null +++ b/collectors/likwid/groups/haswellEP/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ARITH_DIVIDER_UOPS +PMC1 ARITH_DIVIDER_CYCLES + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_DIVIDER_UOPS +Avg. divide unit usage duration = ARITH_DIVIDER_CYCLES/ARITH_DIVIDER_UOPS +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/haswellEP/ENERGY.txt b/collectors/likwid/groups/haswellEP/ENERGY.txt new file mode 100644 index 0000000..ee0af1b --- /dev/null +++ b/collectors/likwid/groups/haswellEP/ENERGY.txt @@ -0,0 +1,35 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR3 PWR_DRAM_ENERGY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +Haswell implements the new RAPL interface. This interface enables monitoring of +the consumed energy at the package (socket) and DRAM level. + diff --git a/collectors/likwid/groups/haswellEP/FALSE_SHARE.txt b/collectors/likwid/groups/haswellEP/FALSE_SHARE.txt new file mode 100644 index 0000000..872dbc1 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/FALSE_SHARE.txt @@ -0,0 +1,34 @@ +SHORT False sharing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM +PMC1 MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM +PMC2 MEM_LOAD_UOPS_RETIRED_ALL_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local LLC hit with false sharing [MByte] 1.E-06*PMC0*64 +Local LLC hit with false sharing rate PMC0/PMC2 +Remote LLC false sharing [MByte] 1.E-06*PMC1*64 +Remote LLC false sharing rate PMC1/PMC2 + +LONG +Formulas: +Local LLC hit with false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM*64 +Local LLC hit with false sharing rate = MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL_ALL +Remote LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM*64 +Remote LLC false sharing rate = MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM/MEM_LOAD_UOPS_RETIRED_ALL_ALL +- +False-sharing of cache lines can dramatically reduce the performance of an +application. This performance group measures the L3 traffic induced by false-sharing. +The false-sharing rate uses all memory loads as reference. +For systems with multiple CPU sockets, this performance group also measures the +false-sharing of cache lines over socket boundaries. 
+Please keep in mind that the MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM event may +undercount by as much as 40% (Errata HSW150). diff --git a/collectors/likwid/groups/haswellEP/FLOPS_AVX.txt b/collectors/likwid/groups/haswellEP/FLOPS_AVX.txt new file mode 100644 index 0000000..15aacb8 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/FLOPS_AVX.txt @@ -0,0 +1,28 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 AVX_INSTS_CALC + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC0*4.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(AVX_INSTS_CALC*8)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(AVX_INSTS_CALC*4)/runtime +- +Packed 32b AVX FLOP/s rates. Approximate counts of AVX & AVX2 256-bit instructions. +May count non-AVX instructions that employ 256-bit operations, including (but +not necessarily limited to) rep string instructions that use 256-bit loads and +stores for optimized performance, XSAVE* and XRSTOR*, and operations that +transition the x87 FPU data registers between x87 and MMX. +Caution: The event AVX_INSTS_CALC counts the insertf128 instruction often used +by the Intel C compilers for (unaligned) vector loads. diff --git a/collectors/likwid/groups/haswellEP/HA.txt b/collectors/likwid/groups/haswellEP/HA.txt new file mode 100644 index 0000000..1e5a700 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/HA.txt @@ -0,0 +1,40 @@ +SHORT Main memory bandwidth in MBytes/s seen from Home agent + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +BBOX0C0 IMC_READS_NORMAL +BBOX0C1 BYPASS_IMC_TAKEN +BBOX0C2 IMC_WRITES_ALL +BBOX1C0 IMC_READS_NORMAL +BBOX1C1 BYPASS_IMC_TAKEN +BBOX1C2 IMC_WRITES_ALL + + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(BBOX0C2+BBOX1C2)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(BBOX0C2+BBOX1C2)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1+BBOX0C2+BBOX1C2)*64.0/time +Memory data volume [GBytes] 1.0E-09*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1+BBOX0C2+BBOX1C2)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_READS_NORMAL)+SUM(BYPASS_IMC_TAKEN))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(IMC_READS_NORMAL)+SUM(BYPASS_IMC_TAKEN))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_WRITES_ALL))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(IMC_WRITES_ALL))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_READS_NORMAL) + SUM(BYPASS_IMC_TAKEN) + SUM(IMC_WRITES_ALL))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(IMC_READS_NORMAL) + SUM(BYPASS_IMC_TAKEN) + SUM(IMC_WRITES_ALL))*64.0 +- +This group derives the same metrics as the MEM group but uses the events of the +Home Agent, a central unit that is responsible for the protocol side of memory +interactions. 
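A note on the FLOPS_AVX scaling in the group above: one 256-bit instruction carries eight single-precision or four double-precision lanes, so the same raw AVX_INSTS_CALC count is scaled by 8 and by 4, and only one of the two results is meaningful for a given binary. A minimal C sketch with a hypothetical count:

    #include <stdio.h>

    int main(void)
    {
        double time = 1.0;    /* Runtime [s], hypothetical */
        double pmc0 = 5.0e8;  /* AVX_INSTS_CALC            */

        printf("Packed SP [MFLOP/s]: %.1f\n", 1e-6 * pmc0 * 8.0 / time);
        printf("Packed DP [MFLOP/s]: %.1f\n", 1e-6 * pmc0 * 4.0 / time);
        /* only one of the two rates applies to a given code */
        return 0;
    }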
diff --git a/collectors/likwid/groups/haswellEP/ICACHE.txt b/collectors/likwid/groups/haswellEP/ICACHE.txt new file mode 100644 index 0000000..f1e2335 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/ICACHE.txt @@ -0,0 +1,33 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ICACHE_ACCESSES +PMC1 ICACHE_MISSES +PMC2 ICACHE_IFETCH_STALL +PMC3 ILD_STALL_IQ_FULL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 +L1I stalls PMC2 +L1I stall rate PMC2/FIXC0 +L1I queue full stalls PMC3 +L1I queue full stall rate PMC3/FIXC0 + +LONG +Formulas: +L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY +L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES +L1I stalls = ICACHE_IFETCH_STALL +L1I stall rate = ICACHE_IFETCH_STALL / INSTR_RETIRED_ANY +- +This group measures some L1 instruction cache metrics. diff --git a/collectors/likwid/groups/haswellEP/L2.txt b/collectors/likwid/groups/haswellEP/L2.txt new file mode 100644 index 0000000..60c7f79 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/L2.txt @@ -0,0 +1,37 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L2_TRANS_L1D_WB +PMC2 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the +number of cache lines loaded from the L2 to the L1 data cache and the writebacks from +the L1 data cache to the L2 cache. The group also outputs total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and cache lines transferred to the instruction +cache. 
diff --git a/collectors/likwid/groups/haswellEP/L2CACHE.txt b/collectors/likwid/groups/haswellEP/L2CACHE.txt new file mode 100644 index 0000000..9b5dd4b --- /dev/null +++ b/collectors/likwid/groups/haswellEP/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_TRANS_ALL_REQUESTS +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure how often it was necessary to get +cache lines from memory. And finally L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/haswellEP/L3.txt b/collectors/likwid/groups/haswellEP/L3.txt new file mode 100644 index 0000000..0109db3 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/L3.txt @@ -0,0 +1,36 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ALL +PMC1 L2_TRANS_L2_WB + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the +number of cache lines allocated in the L2 and the number of modified cache lines +evicted from the L2. This group also outputs the data volume transferred between the +L3 and the measured cores' L2 caches. Note that this bandwidth also includes data +transfers due to a write allocate load on a store miss in L2. 
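The request rate, miss rate, and miss ratio in the L2CACHE group above are linked by the identity miss ratio = miss rate / request rate. A minimal C sketch with hypothetical counts:

    #include <stdio.h>

    int main(void)
    {
        double instr    = 4.0e9; /* INSTR_RETIRED_ANY     */
        double requests = 8.0e8; /* L2_TRANS_ALL_REQUESTS */
        double misses   = 1.0e8; /* L2_RQSTS_MISS         */

        double request_rate = requests / instr;  /* accesses per instruction */
        double miss_rate    = misses / instr;    /* misses per instruction   */
        double miss_ratio   = misses / requests; /* misses per access        */

        printf("L2 request rate: %.3f\n", request_rate); /* 0.200 */
        printf("L2 miss rate   : %.3f\n", miss_rate);    /* 0.025 */
        printf("L2 miss ratio  : %.3f\n", miss_ratio);   /* 0.125 */
        /* note: miss_ratio == miss_rate / request_rate */
        return 0;
    }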
+ diff --git a/collectors/likwid/groups/haswellEP/L3CACHE.txt b/collectors/likwid/groups/haswellEP/L3CACHE.txt new file mode 100644 index 0000000..f863daa --- /dev/null +++ b/collectors/likwid/groups/haswellEP/L3CACHE.txt @@ -0,0 +1,35 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_RETIRED_L3_ALL +PMC1 MEM_LOAD_UOPS_RETIRED_L3_MISS +PMC2 UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate PMC0/PMC2 +L3 miss rate PMC1/PMC2 +L3 miss ratio PMC1/PMC0 + +LONG +Formulas: +L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL +L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL +L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL +- +This group measures the locality of your data accesses with regard to the +L3 cache. L3 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L3 miss rate gives a measure how often it was necessary to get +cache lines from memory. And finally L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/haswellEP/MEM.txt b/collectors/likwid/groups/haswellEP/MEM.txt new file mode 100644 index 0000000..2a17a2c --- /dev/null +++ b/collectors/likwid/groups/haswellEP/MEM.txt @@ -0,0 +1,52 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime +Memory 
data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on a +per-socket basis. Some of the counters may not be available on your system. +Also outputs total data volume transferred from main memory. +The same metrics are provided by the HA group. + diff --git a/collectors/likwid/groups/haswellEP/NUMA.txt b/collectors/likwid/groups/haswellEP/NUMA.txt new file mode 100644 index 0000000..41fbe62 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/NUMA.txt @@ -0,0 +1,33 @@ +SHORT Local and remote memory accesses + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 OFFCORE_RESPONSE_0_LOCAL_DRAM +PMC1 OFFCORE_RESPONSE_1_REMOTE_DRAM + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local DRAM data volume [GByte] 1.E-09*PMC0*64 +Local DRAM bandwidth [MByte/s] 1.E-06*(PMC0*64)/time +Remote DRAM data volume [GByte] 1.E-09*PMC1*64 +Remote DRAM bandwidth [MByte/s] 1.E-06*(PMC1*64)/time +Memory data volume [GByte] 1.E-09*(PMC0+PMC1)*64 +Memory bandwidth [MByte/s] 1.E-06*((PMC0+PMC1)*64)/time + +LONG +Formulas: +CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY +Local DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_0_LOCAL_DRAM*64 +Local DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_0_LOCAL_DRAM*64)/time +Remote DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_1_REMOTE_DRAM*64 +Remote DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_1_REMOTE_DRAM*64)/time +Memory data volume [GByte] = 1.E-09*(OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64 +Memory bandwidth [MByte/s] = 1.E-06*((OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64)/time +-- +This performance group measures the data traffic of CPU cores to local and remote +memory. 
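The MEM group arithmetic above in one place: every CAS_COUNT_RD/WR event corresponds to one 64-byte cache line, summed over all memory channels (MBOXes). A minimal C sketch with hypothetical per-channel counts:

    #include <stdio.h>

    int main(void)
    {
        double rd[8] = {2e8, 2e8, 2e8, 2e8, 2e8, 2e8, 2e8, 2e8}; /* CAS_COUNT_RD */
        double wr[8] = {1e8, 1e8, 1e8, 1e8, 1e8, 1e8, 1e8, 1e8}; /* CAS_COUNT_WR */
        double time = 10.0, rds = 0.0, wrs = 0.0;

        for (int i = 0; i < 8; i++) { rds += rd[i]; wrs += wr[i]; }

        printf("Memory read bandwidth  [MBytes/s]: %.1f\n", 1e-6 * rds * 64.0 / time);
        printf("Memory write bandwidth [MBytes/s]: %.1f\n", 1e-6 * wrs * 64.0 / time);
        printf("Memory data volume     [GBytes]  : %.2f\n", 1e-9 * (rds + wrs) * 64.0);
        return 0;
    }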
diff --git a/collectors/likwid/groups/haswellEP/PORT_USAGE.txt b/collectors/likwid/groups/haswellEP/PORT_USAGE.txt new file mode 100644 index 0000000..eb74ffe --- /dev/null +++ b/collectors/likwid/groups/haswellEP/PORT_USAGE.txt @@ -0,0 +1,46 @@ +SHORT Execution port utilization + +REQUIRE_NOHT + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_PORT_PORT_0 +PMC1 UOPS_EXECUTED_PORT_PORT_1 +PMC2 UOPS_EXECUTED_PORT_PORT_2 +PMC3 UOPS_EXECUTED_PORT_PORT_3 +PMC4 UOPS_EXECUTED_PORT_PORT_4 +PMC5 UOPS_EXECUTED_PORT_PORT_5 +PMC6 UOPS_EXECUTED_PORT_PORT_6 +PMC7 UOPS_EXECUTED_PORT_PORT_7 + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Port0 usage ratio PMC0/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port1 usage ratio PMC1/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port2 usage ratio PMC2/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port3 usage ratio PMC3/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port4 usage ratio PMC4/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port5 usage ratio PMC5/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port6 usage ratio PMC6/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port7 usage ratio PMC7/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) + +LONG +Formulas: +Port0 usage ratio = UOPS_EXECUTED_PORT_PORT_0/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port1 usage ratio = UOPS_EXECUTED_PORT_PORT_1/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port2 usage ratio = UOPS_EXECUTED_PORT_PORT_2/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port3 usage ratio = UOPS_EXECUTED_PORT_PORT_3/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port4 usage ratio = UOPS_EXECUTED_PORT_PORT_4/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port5 usage ratio = UOPS_EXECUTED_PORT_PORT_5/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port6 usage ratio = UOPS_EXECUTED_PORT_PORT_6/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port7 usage ratio = UOPS_EXECUTED_PORT_PORT_7/SUM(UOPS_EXECUTED_PORT_PORT_*) +- +This group measures the execution port utilization in a CPU core. The group can +only be measured when HyperThreading is disabled because only then each CPU core +can program eight counters. 
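The PORT_USAGE ratios above are simply each port's uOP count over the sum across all eight ports. A minimal C sketch with hypothetical counts:

    #include <stdio.h>

    int main(void)
    {
        double port[8] = {4e8, 4e8, 3e8, 3e8, 2e8, 2e8, 1e8, 1e8};
        double sum = 0.0;

        for (int i = 0; i < 8; i++)
            sum += port[i];
        for (int i = 0; i < 8; i++)
            printf("Port%d usage ratio: %.3f\n", i, port[i] / sum);
        return 0;
    }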
diff --git a/collectors/likwid/groups/haswellEP/QPI.txt b/collectors/likwid/groups/haswellEP/QPI.txt new file mode 100644 index 0000000..dcdda85 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/QPI.txt @@ -0,0 +1,49 @@ +SHORT QPI Link Layer data + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +QBOX0C0 RXL_FLITS_G0_DATA +QBOX1C0 RXL_FLITS_G0_DATA +QBOX0C1 RXL_FLITS_G0_NON_DATA +QBOX1C1 RXL_FLITS_G0_NON_DATA +QBOX0C2 TXL_FLITS_G0_DATA +QBOX1C2 TXL_FLITS_G0_DATA +QBOX0C3 TXL_FLITS_G0_NON_DATA +QBOX1C3 TXL_FLITS_G0_NON_DATA + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +QPI send data volume [GByte] 1.E-09*(QBOX0C2+QBOX1C2)*8 +QPI send data bandwidth [MByte/s] 1.E-06*(QBOX0C2+QBOX1C2)*8/time +QPI send link volume [GByte] 1.E-09*(QBOX0C2+QBOX1C2+QBOX0C3+QBOX1C3)*8 +QPI send link bandwidth [MByte/s] 1.E-06*(QBOX0C2+QBOX1C2+QBOX0C3+QBOX1C3)*8/time +QPI receive data volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0)*8 +QPI receive data bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0)*8/time +QPI receive link volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0+QBOX0C1+QBOX1C1)*8 +QPI receive link bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0+QBOX0C1+QBOX1C1)*8/time +QPI total transfer volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0+QBOX0C2+QBOX1C2+QBOX0C1+QBOX1C1+QBOX0C3+QBOX1C3)*8 +QPI total bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0+QBOX0C2+QBOX1C2+QBOX0C1+QBOX1C1+QBOX0C3+QBOX1C3)*8/time + +LONG +Formulas: +QPI send data volume [GByte] = 1.E-09*(sum(TXL_FLITS_G0_DATA)*8) +QPI send data bandwidth [MByte/s] = 1.E-06*(sum(TXL_FLITS_G0_DATA)*8)/runtime +QPI send link volume [GByte] = 1.E-09*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8) +QPI send link bandwidth [MByte/s] = 1.E-06*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8)/runtime +QPI receive data volume [GByte] = 1.E-09*(sum(RXL_FLITS_G0_DATA)*8) +QPI receive data bandwidth [MByte/s] = 1.E-06*(sum(RXL_FLITS_G0_DATA)*8)/runtime +QPI receive link volume [GByte] = 1.E-09*((sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8) +QPI receive link bandwidth [MByte/s] = 1.E-06*((sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8)/runtime +QPI total transfer volume [GByte] = 1.E-09*(sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA)+sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8 +QPI total bandwidth [MByte/s] = 1.E-06*(sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA)+sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8/time +-- +The Intel QPI Link Layer is responsible for packetizing requests from the caching agent (CBOXes) +on the way out to the system interface. For Haswell EP systems, the Link Layer and the +Ring interface are separated. The QPI link volume contains header, data and trailer while the +QPI data volume counts only the data flits. diff --git a/collectors/likwid/groups/haswellEP/RECOVERY.txt b/collectors/likwid/groups/haswellEP/RECOVERY.txt new file mode 100644 index 0000000..7928ee4 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/RECOVERY.txt @@ -0,0 +1,22 @@ +SHORT Recovery duration + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INT_MISC_RECOVERY_CYCLES +PMC1 INT_MISC_RECOVERY_COUNT + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Avg. recovery duration PMC0/PMC1 + +LONG +Formulas: +Avg. 
recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT +- +This group measures the duration of recoveries after SSE exceptions, memory +disambiguation, etc. diff --git a/collectors/likwid/groups/haswellEP/SBOX.txt b/collectors/likwid/groups/haswellEP/SBOX.txt new file mode 100644 index 0000000..24f86b6 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/SBOX.txt @@ -0,0 +1,28 @@ +SHORT Ring Transfer bandwidth + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +SBOX0C0 RING_BL_USED_ANY +SBOX1C0 RING_BL_USED_ANY +SBOX2C0 RING_BL_USED_ANY +SBOX3C0 RING_BL_USED_ANY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Ring transfer bandwidth [MByte/s] 1.E-06*(SBOX0C0+SBOX1C0+SBOX2C0+SBOX3C0)*32/time +Ring transfer data volume [GByte] 1.E-09*(SBOX0C0+SBOX1C0+SBOX2C0+SBOX3C0)*32 + +LONG +Formulas: +Ring transfer bandwidth [MByte/s] = 1.E-06*(SUM(SBOXxC0)*32)/time +Ring transfer data volume [GByte] = 1.E-09*(SUM(SBOXxC0)*32) +-- +The SBOXes manage the transfer between the socket local ring(s). For micro architectures +prior to Haswell, the SBOX and QBOX were similar because only a single ring was used. +Haswell systems with a high core count assemble two rings that are connected through +the SBOXes; the traffic between the sockets is handled by the QBOXes. diff --git a/collectors/likwid/groups/haswellEP/TLB_DATA.txt b/collectors/likwid/groups/haswellEP/TLB_DATA.txt new file mode 100644 index 0000000..8d94e05 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/TLB_DATA.txt @@ -0,0 +1,35 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK +PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK +PMC2 DTLB_LOAD_MISSES_WALK_DURATION +PMC3 DTLB_STORE_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB load misses PMC0 +L1 DTLB load miss rate PMC0/FIXC0 +L1 DTLB load miss duration [Cyc] PMC2/PMC0 +L1 DTLB store misses PMC1 +L1 DTLB store miss rate PMC1/FIXC0 +L1 DTLB store miss duration [Cyc] PMC3/PMC1 + +LONG +Formulas: +L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK +L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK +- +The DTLB load and store miss rates give a measure of how often a TLB miss occurred +per instruction. The duration measures how many cycles a page table walk took. 
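The TLB_DATA duration metric above divides total walk cycles by the number of walks, yielding the average cost of one page table walk. A minimal C sketch with hypothetical counts:

    #include <stdio.h>

    int main(void)
    {
        double load_walks  = 2.0e6; /* DTLB_LOAD_MISSES_CAUSES_A_WALK */
        double load_cycles = 6.0e7; /* DTLB_LOAD_MISSES_WALK_DURATION */

        printf("L1 DTLB load miss duration [Cyc]: %.1f\n",
               load_cycles / load_walks); /* ~30 cycles per walk */
        return 0;
    }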
+ diff --git a/collectors/likwid/groups/haswellEP/TLB_INSTR.txt b/collectors/likwid/groups/haswellEP/TLB_INSTR.txt new file mode 100644 index 0000000..235d977 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/TLB_INSTR.txt @@ -0,0 +1,28 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ITLB_MISSES_CAUSES_A_WALK +PMC1 ITLB_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + + +LONG +Formulas: +L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK +L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK +- +The ITLB miss rate gives a measure of how often a TLB miss occurred +per instruction. The duration measures how many cycles a page table walk took. + diff --git a/collectors/likwid/groups/haswellEP/TMA.txt b/collectors/likwid/groups/haswellEP/TMA.txt new file mode 100644 index 0000000..afb4126 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/TMA.txt @@ -0,0 +1,48 @@ +SHORT Top down cycle allocation + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_RETIRED_RETIRE_SLOTS +PMC2 IDQ_UOPS_NOT_DELIVERED_CORE +PMC3 INT_MISC_RECOVERY_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +IPC FIXC0/FIXC1 +Total Slots 4*FIXC1 +Slots Retired PMC1 +Fetch Bubbles PMC2 +Recovery Bubbles 4*PMC3 +Front End [%] PMC2/(4*FIXC1)*100 +Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100 +Retiring [%] PMC1/(4*FIXC1)*100 +Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100 + +LONG +Formulas: +Total Slots = 4*CPU_CLK_UNHALTED_CORE +Slots Retired = UOPS_RETIRED_RETIRE_SLOTS +Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE +Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES +Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100 +Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100 +Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100 +Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100 +-- +This performance group measures cycles to determine percentage of time spent in +front end, back end, retiring and speculation. These metrics are published and +verified by Intel. 
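A worked example of the top-down slot accounting above (4 issue slots per cycle; the four categories partition the total slots). The counter values below are hypothetical:

    #include <stdio.h>

    int main(void)
    {
        double clk   = 1.0e9;  /* CPU_CLK_UNHALTED_CORE       */
        double iss   = 2.8e9;  /* UOPS_ISSUED_ANY             */
        double ret   = 2.4e9;  /* UOPS_RETIRED_RETIRE_SLOTS   */
        double fetch = 0.6e9;  /* IDQ_UOPS_NOT_DELIVERED_CORE */
        double rec   = 0.05e9; /* INT_MISC_RECOVERY_CYCLES    */
        double slots = 4.0 * clk;

        double front = fetch / slots * 100.0;
        double spec  = (iss - ret + 4.0 * rec) / slots * 100.0;
        double retir = ret / slots * 100.0;
        double back  = (1.0 - (fetch + iss + 4.0 * rec) / slots) * 100.0;

        printf("Front End   [%%]: %5.1f\n", front);  /* 15.0 */
        printf("Speculation [%%]: %5.1f\n", spec);   /* 15.0 */
        printf("Retiring    [%%]: %5.1f\n", retir);  /* 60.0 */
        printf("Back End    [%%]: %5.1f\n", back);   /* 10.0 */
        /* the four shares add up to 100% of the issue slots */
        return 0;
    }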
Further information: +Webpage describing Top-Down Method and its usage in Intel vTune: +https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method +Paper by Ahmad Yasin: +https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0 +Slides by Ahmad Yasin: +http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf +The performance group was originally published here: +http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/ diff --git a/collectors/likwid/groups/haswellEP/UOPS.txt b/collectors/likwid/groups/haswellEP/UOPS.txt new file mode 100644 index 0000000..e6cc208 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/UOPS.txt @@ -0,0 +1,35 @@ +SHORT UOPs execution info + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_EXECUTED_THREAD +PMC2 UOPS_RETIRED_ALL +PMC3 UOPS_ISSUED_FLAGS_MERGE + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Issued UOPs PMC0 +Merged UOPs PMC3 +Executed UOPs PMC1 +Retired UOPs PMC2 + +LONG +Formulas: +Issued UOPs = UOPS_ISSUED_ANY +Merged UOPs = UOPS_ISSUED_FLAGS_MERGE +Executed UOPs = UOPS_EXECUTED_THREAD +Retired UOPs = UOPS_RETIRED_ALL +- +This group returns information about the instruction pipeline. It measures the +issued, executed and retired uOPs and returns the number of uOPs which were issued +but not executed as well as the number of uOPs which were executed but never retired. +The executed but not retired uOPs commonly come from speculatively executed branches. + diff --git a/collectors/likwid/groups/haswellEP/UOPS_EXEC.txt b/collectors/likwid/groups/haswellEP/UOPS_EXEC.txt new file mode 100644 index 0000000..7042df7 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/UOPS_EXEC.txt @@ -0,0 +1,31 @@ +SHORT UOPs execution + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_USED_CYCLES +PMC1 UOPS_EXECUTED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the execution stage in the pipeline. Used cycles are all cycles where uOPs are +executed while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles. 
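The EDGEDETECT trick above is worth spelling out: the plain counter accumulates total stall cycles, while the edge-detected counter increments once per 0-to-1 transition, i.e. once per stall phase, so the quotient is the average length of a stall phase. A minimal C sketch with hypothetical counts:

    #include <stdio.h>

    int main(void)
    {
        double stall_cycles = 3.0e8; /* UOPS_EXECUTED_STALL_CYCLES         */
        double stall_starts = 1.5e7; /* same event with :EDGEDETECT (0->1) */

        printf("Avg stall duration [cycles]: %.1f\n",
               stall_cycles / stall_starts); /* 20 cycles per stall phase */
        return 0;
    }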
diff --git a/collectors/likwid/groups/haswellEP/UOPS_ISSUE.txt b/collectors/likwid/groups/haswellEP/UOPS_ISSUE.txt new file mode 100644 index 0000000..9aac923 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/UOPS_ISSUE.txt @@ -0,0 +1,31 @@ +SHORT UOPs issuing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_USED_CYCLES +PMC1 UOPS_ISSUED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the issue stage in the pipeline. Used cycles are all cycles where uOPs are +issued while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles. diff --git a/collectors/likwid/groups/haswellEP/UOPS_RETIRE.txt b/collectors/likwid/groups/haswellEP/UOPS_RETIRE.txt new file mode 100644 index 0000000..0f37585 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/UOPS_RETIRE.txt @@ -0,0 +1,31 @@ +SHORT UOPs retirement + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_RETIRED_USED_CYCLES +PMC1 UOPS_RETIRED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the retirement stage in the pipeline (re-order buffer). Used cycles are all +cycles where uOPs are retired while unused cycles refer to pipeline stalls. +Moreover, the group calculates the average stall duration in cycles. 
diff --git a/collectors/likwid/groups/interlagos/BRANCH.txt b/collectors/likwid/groups/interlagos/BRANCH.txt new file mode 100644 index 0000000..7495b74 --- /dev/null +++ b/collectors/likwid/groups/interlagos/BRANCH.txt @@ -0,0 +1,26 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +PMC0 RETIRED_INSTRUCTIONS +PMC1 RETIRED_BRANCH_INSTR +PMC2 RETIRED_MISPREDICTED_BRANCH_INSTR + +METRICS +Runtime (RDTSC) [s] time +Branch rate PMC1/PMC0 +Branch misprediction rate PMC2/PMC0 +Branch misprediction ratio PMC2/PMC1 +Instructions per branch PMC0/PMC1 + +LONG +Formulas: +Branch rate = RETIRED_BRANCH_INSTR/RETIRED_INSTRUCTIONS +Branch misprediction rate = RETIRED_MISPREDICTED_BRANCH_INSTR/RETIRED_INSTRUCTIONS +Branch misprediction ratio = RETIRED_MISPREDICTED_BRANCH_INSTR/RETIRED_BRANCH_INSTR +Instructions per branch = RETIRED_INSTRUCTIONS/RETIRED_BRANCH_INSTR +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio directly relates +the number of mispredicted branches to the number of all branch instructions. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/interlagos/CACHE.txt b/collectors/likwid/groups/interlagos/CACHE.txt new file mode 100644 index 0000000..0d785fc --- /dev/null +++ b/collectors/likwid/groups/interlagos/CACHE.txt @@ -0,0 +1,32 @@ +SHORT Data cache miss rate/ratio + +EVENTSET +PMC0 RETIRED_INSTRUCTIONS +PMC1 DATA_CACHE_ACCESSES +PMC2 DATA_CACHE_REFILLS_VALID +PMC3 DATA_CACHE_MISSES_ALL + +METRICS +Runtime (RDTSC) [s] time +data cache misses PMC3 +data cache request rate PMC1/PMC0 +data cache miss rate (PMC2)/PMC0 +data cache miss ratio (PMC2)/PMC1 + +LONG +Formulas: +data cache misses = DATA_CACHE_MISSES_ALL +data cache request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS +data cache miss rate = (DATA_CACHE_REFILLS_VALID) / RETIRED_INSTRUCTIONS +data cache miss ratio = (DATA_CACHE_REFILLS_VALID)/DATA_CACHE_ACCESSES +- +This group measures the locality of your data accesses with regard to the +L1 cache. Data cache request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The data cache miss rate gives a measure how often it was necessary to get +cache lines from higher levels of the memory hierarchy. And finally +data cache miss ratio tells you how many of your memory references required +a cache line to be loaded from a higher level. While the data cache miss rate +might be given by your algorithm you should try to get data cache miss ratio +as low as possible by increasing your cache reuse. + diff --git a/collectors/likwid/groups/interlagos/CPI.txt b/collectors/likwid/groups/interlagos/CPI.txt new file mode 100644 index 0000000..c0746e7 --- /dev/null +++ b/collectors/likwid/groups/interlagos/CPI.txt @@ -0,0 +1,26 @@ +SHORT Cycles per instruction + +EVENTSET +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_UOPS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC1*inverseClock +CPI PMC1/PMC0 +CPI (based on uops) PMC1/PMC2 +IPC PMC0/PMC1 + +LONG +Formulas: +CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS +CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS +IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED +- +This group measures how efficiently the processor works with +regard to instruction throughput. Also important as a standalone +metric is RETIRED_INSTRUCTIONS as it tells you how many instructions +you need to execute for a task. 
An optimization might show very +low CPI values but execute many more instructions for it. + diff --git a/collectors/likwid/groups/interlagos/DATA.txt b/collectors/likwid/groups/interlagos/DATA.txt new file mode 100644 index 0000000..75f1f60 --- /dev/null +++ b/collectors/likwid/groups/interlagos/DATA.txt @@ -0,0 +1,16 @@ +SHORT Load to store ratio + +EVENTSET +PMC0 LS_DISPATCH_LOADS +PMC1 LS_DISPATCH_STORES + +METRICS +Runtime (RDTSC) [s] time +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = LS_DISPATCH_LOADS/LS_DISPATCH_STORES +- +This is a simple metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/interlagos/FLOPS_DP.txt b/collectors/likwid/groups/interlagos/FLOPS_DP.txt new file mode 100644 index 0000000..7af248c --- /dev/null +++ b/collectors/likwid/groups/interlagos/FLOPS_DP.txt @@ -0,0 +1,23 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_UOPS +PMC3 RETIRED_FLOPS_DOUBLE_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC1*inverseClock +DP [MFLOP/s] 1.0E-06*(PMC3)/time +CPI PMC1/PMC0 +CPI (based on uops) PMC1/PMC2 +IPC PMC0/PMC1 + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(RETIRED_FLOPS_DOUBLE_ALL)/time +- +Profiling group to measure double precision FLOP rate. + + diff --git a/collectors/likwid/groups/interlagos/FLOPS_SP.txt b/collectors/likwid/groups/interlagos/FLOPS_SP.txt new file mode 100644 index 0000000..14af2c2 --- /dev/null +++ b/collectors/likwid/groups/interlagos/FLOPS_SP.txt @@ -0,0 +1,23 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_UOPS +PMC3 RETIRED_FLOPS_SINGLE_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC1*inverseClock +SP [MFLOP/s] 1.0E-06*(PMC3)/time +CPI PMC1/PMC0 +CPI (based on uops) PMC1/PMC2 +IPC PMC0/PMC1 + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(RETIRED_FLOPS_SINGLE_ALL)/time +- +Profiling group to measure single precision FLOP rate. + + diff --git a/collectors/likwid/groups/interlagos/FPU_EXCEPTION.txt b/collectors/likwid/groups/interlagos/FPU_EXCEPTION.txt new file mode 100644 index 0000000..0969ae1 --- /dev/null +++ b/collectors/likwid/groups/interlagos/FPU_EXCEPTION.txt @@ -0,0 +1,21 @@ +SHORT Floating point exceptions + +EVENTSET +PMC0 RETIRED_INSTRUCTIONS +PMC1 RETIRED_FP_INSTRUCTIONS_ALL +PMC2 FPU_EXCEPTION_ALL + +METRICS +Runtime (RDTSC) [s] time +Overall FP exception rate PMC2/PMC0 +FP exception rate PMC2/PMC1 + +LONG +Formulas: +Overall FP exception rate = FPU_EXCEPTION_ALL / RETIRED_INSTRUCTIONS +FP exception rate = FPU_EXCEPTION_ALL / RETIRED_FP_INSTRUCTIONS_ALL +- +Floating point exceptions occur e.g. on the treatment of denormal numbers. +There might be a large penalty if there are too many floating point +exceptions. 
+ diff --git a/collectors/likwid/groups/interlagos/ICACHE.txt b/collectors/likwid/groups/interlagos/ICACHE.txt new file mode 100644 index 0000000..62b91d6 --- /dev/null +++ b/collectors/likwid/groups/interlagos/ICACHE.txt @@ -0,0 +1,23 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +PMC0 INSTRUCTION_CACHE_FETCHES +PMC1 INSTRUCTION_CACHE_L2_REFILLS +PMC2 INSTRUCTION_CACHE_SYSTEM_REFILLS +PMC3 RETIRED_INSTRUCTIONS + +METRICS +Runtime (RDTSC) [s] time +L1I request rate PMC0/PMC3 +L1I miss rate (PMC1+PMC2)/PMC3 +L1I miss ratio (PMC1+PMC2)/PMC0 + +LONG +Formulas: +L1I request rate = INSTRUCTION_CACHE_FETCHES / RETIRED_INSTRUCTIONS +L1I miss rate = (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/RETIRED_INSTRUCTIONS +L1I miss ratio = (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/INSTRUCTION_CACHE_FETCHES +- +This group measures the locality of your instruction code with regard to the +L1 I-Cache. + diff --git a/collectors/likwid/groups/interlagos/L2.txt b/collectors/likwid/groups/interlagos/L2.txt new file mode 100644 index 0000000..4d90ef8 --- /dev/null +++ b/collectors/likwid/groups/interlagos/L2.txt @@ -0,0 +1,29 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +PMC0 DATA_CACHE_REFILLS_ALL +PMC1 DATA_CACHE_REFILLS_SYSTEM +PMC2 CPU_CLOCKS_UNHALTED + +METRICS +Runtime (RDTSC) [s] time +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0-PMC1)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0-PMC1)*64.0 +Cache refill bandwidth System/L2 [MBytes/s] 1.0E-06*PMC0*64.0/time +Cache refill bandwidth System [MBytes/s] 1.0E-06*PMC1*64.0/time + +LONG +Formulas: +L2 bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_ALL-DATA_CACHE_REFILLS_SYSTEM)*64/time +L2 data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_ALL-DATA_CACHE_REFILLS_SYSTEM)*64 +Cache refill bandwidth system/L2 [MBytes/s] = 1.0E-06*DATA_CACHE_REFILLS_ALL*64/time +Cache refill bandwidth system [MBytes/s] = 1.0E-06*DATA_CACHE_REFILLS_SYSTEM*64/time +- +Profiling group to measure L2 cache bandwidth. The bandwidth is +computed by the number of cache lines loaded from L2 to L1 and the +number of modified cache lines evicted from the L1. +Note that this bandwidth also includes data transfers due to a +write allocate load on a store miss in L1 and copy-back transfers +originating from L2. The L2 data volume is the total data volume transferred +between L2 and L1. + diff --git a/collectors/likwid/groups/interlagos/L2CACHE.txt b/collectors/likwid/groups/interlagos/L2CACHE.txt new file mode 100644 index 0000000..49b9555 --- /dev/null +++ b/collectors/likwid/groups/interlagos/L2CACHE.txt @@ -0,0 +1,31 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +PMC0 RETIRED_INSTRUCTIONS +PMC1 REQUESTS_TO_L2_DC_FILL +PMC2 L2_CACHE_MISS_DC_FILL + +METRICS +Runtime (RDTSC) [s] time +L2 request rate PMC1/PMC0 +L2 miss rate PMC2/PMC0 +L2 miss ratio PMC2/PMC1 + +LONG +Formulas: +L2 request rate = REQUESTS_TO_L2_DC_FILL/RETIRED_INSTRUCTIONS +L2 miss rate = L2_CACHE_MISS_DC_FILL/RETIRED_INSTRUCTIONS +L2 miss ratio = L2_CACHE_MISS_DC_FILL/REQUESTS_TO_L2_DC_FILL +- +This group measures the locality of your data accesses with regard to the L2 +Cache. L2 request rate tells you how data intensive your code is or how many +data accesses you have on average per instruction. The L2 miss rate gives a +measure how often it was necessary to get cache lines from memory. And finally +L2 miss ratio tells you how many of your memory references required a cache line +to be loaded from a higher level. 
While the data cache miss rate might be +given by your algorithm you should try to get data cache miss ratio as low as +possible by increasing your cache reuse. This group is inspired by the +whitepaper "Basic Performance Measurements for AMD Athlon 64, AMD Opteron and +AMD Phenom Processors" by Paul J. Drongowski. + + diff --git a/collectors/likwid/groups/interlagos/L3.txt b/collectors/likwid/groups/interlagos/L3.txt new file mode 100644 index 0000000..5c9ea4d --- /dev/null +++ b/collectors/likwid/groups/interlagos/L3.txt @@ -0,0 +1,29 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +PMC0 L2_FILL_WB_FILL +PMC1 L2_FILL_WB_WB +PMC2 CPU_CLOCKS_UNHALTED + +METRICS +Runtime (RDTSC) [s] time +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_FILL_WB_FILL*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_FILL_WB_FILL*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_FILL_WB_WB*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_FILL_WB_WB*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_FILL_WB_FILL+L2_FILL_WB_WB)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_FILL_WB_FILL+L2_FILL_WB_WB)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is +computed by the number of cache lines loaded from L3 to L2 and the +number of modified cache lines evicted from the L2. + diff --git a/collectors/likwid/groups/interlagos/L3CACHE.txt b/collectors/likwid/groups/interlagos/L3CACHE.txt new file mode 100644 index 0000000..5a442c6 --- /dev/null +++ b/collectors/likwid/groups/interlagos/L3CACHE.txt @@ -0,0 +1,35 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +PMC0 RETIRED_INSTRUCTIONS +UPMC0 UNC_READ_REQ_TO_L3_ALL +UPMC1 UNC_L3_CACHE_MISS_ALL +UPMC2 UNC_L3_LATENCY_CYCLE_COUNT +UPMC3 UNC_L3_LATENCY_REQUEST_COUNT + +METRICS +Runtime (RDTSC) [s] time +L3 request rate UPMC0/PMC0 +L3 miss rate UPMC1/PMC0 +L3 miss ratio UPMC1/UPMC0 +L3 average access latency [cycles] UPMC2/UPMC3 + +LONG +Formulas: +L3 request rate = UNC_READ_REQ_TO_L3_ALL/RETIRED_INSTRUCTIONS +L3 miss rate = UNC_L3_CACHE_MISS_ALL/RETIRED_INSTRUCTIONS +L3 miss ratio = UNC_L3_CACHE_MISS_ALL/UNC_READ_REQ_TO_L3_ALL +L3 average access latency = UNC_L3_LATENCY_CYCLE_COUNT/UNC_L3_LATENCY_REQUEST_COUNT +- +This group measures the locality of your data accesses with regard to the L3 +Cache. L3 request rate tells you how data intensive your code is or how many +data accesses you have on average per instruction. The L3 miss rate gives a +measure how often it was necessary to get cache lines from memory. And finally +L3 miss ratio tells you how many of your memory references required a cache line +to be loaded from a higher level. While the data cache miss rate might be +given by your algorithm you should try to get data cache miss ratio as low as +possible by increasing your cache reuse. This group was inspired by the +whitepaper "Basic Performance Measurements for AMD Athlon 64, AMD Opteron and +AMD Phenom Processors" by Paul J. Drongowski. 
+
+
diff --git a/collectors/likwid/groups/interlagos/LINKS.txt b/collectors/likwid/groups/interlagos/LINKS.txt
new file mode 100644
index 0000000..dbf3cd0
--- /dev/null
+++ b/collectors/likwid/groups/interlagos/LINKS.txt
@@ -0,0 +1,26 @@
+SHORT Bandwidth on the HyperTransport links
+
+EVENTSET
+UPMC0 UNC_LINK_TRANSMIT_BW_L0_USE
+UPMC1 UNC_LINK_TRANSMIT_BW_L1_USE
+UPMC2 UNC_LINK_TRANSMIT_BW_L2_USE
+UPMC3 UNC_LINK_TRANSMIT_BW_L3_USE
+
+METRICS
+Runtime (RDTSC) [s] time
+Link bandwidth L0 [MBytes/s] 1.0E-06*UPMC0*4.0/time
+Link bandwidth L1 [MBytes/s] 1.0E-06*UPMC1*4.0/time
+Link bandwidth L2 [MBytes/s] 1.0E-06*UPMC2*4.0/time
+Link bandwidth L3 [MBytes/s] 1.0E-06*UPMC3*4.0/time
+
+LONG
+Formulas:
+Link bandwidth L0 [MBytes/s] = 1.0E-06*UNC_LINK_TRANSMIT_BW_L0_USE*4.0/time
+Link bandwidth L1 [MBytes/s] = 1.0E-06*UNC_LINK_TRANSMIT_BW_L1_USE*4.0/time
+Link bandwidth L2 [MBytes/s] = 1.0E-06*UNC_LINK_TRANSMIT_BW_L2_USE*4.0/time
+Link bandwidth L3 [MBytes/s] = 1.0E-06*UNC_LINK_TRANSMIT_BW_L3_USE*4.0/time
+-
+Profiling group to measure the HyperTransport link bandwidth for the four links
+of a local node. This indicates the data flow between different ccNUMA nodes.
+
+
diff --git a/collectors/likwid/groups/interlagos/MEM.txt b/collectors/likwid/groups/interlagos/MEM.txt
new file mode 100644
index 0000000..2fa9dfe
--- /dev/null
+++ b/collectors/likwid/groups/interlagos/MEM.txt
@@ -0,0 +1,20 @@
+SHORT Main memory bandwidth in MBytes/s
+
+EVENTSET
+UPMC0 UNC_DRAM_ACCESSES_DCT0_ALL
+UPMC1 UNC_DRAM_ACCESSES_DCT1_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64.0
+
+LONG
+Formulas:
+Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_DRAM_ACCESSES_DCT0_ALL+UNC_DRAM_ACCESSES_DCT1_ALL)*64/time
+Memory data volume [GBytes] = 1.0E-09*(UNC_DRAM_ACCESSES_DCT0_ALL+UNC_DRAM_ACCESSES_DCT1_ALL)*64
+-
+Profiling group to measure the memory bandwidth drawn by all cores of a socket.
+Note: As this group measures the accesses from all cores, it only makes sense
+to measure with one core per socket, similar to the Intel Nehalem uncore events.
+
diff --git a/collectors/likwid/groups/interlagos/NUMA.txt b/collectors/likwid/groups/interlagos/NUMA.txt
new file mode 100644
index 0000000..79f3618
--- /dev/null
+++ b/collectors/likwid/groups/interlagos/NUMA.txt
@@ -0,0 +1,28 @@
+SHORT Read/Write Events between the ccNUMA nodes
+
+EVENTSET
+UPMC0 UNC_CPU_TO_DRAM_LOCAL_TO_0
+UPMC1 UNC_CPU_TO_DRAM_LOCAL_TO_1
+UPMC2 UNC_CPU_TO_DRAM_LOCAL_TO_2
+UPMC3 UNC_CPU_TO_DRAM_LOCAL_TO_3
+
+METRICS
+Runtime (RDTSC) [s] time
+DRAM read/write local to 0 [MegaEvents/s] 1.0E-06*UPMC0/time
+DRAM read/write local to 1 [MegaEvents/s] 1.0E-06*UPMC1/time
+DRAM read/write local to 2 [MegaEvents/s] 1.0E-06*UPMC2/time
+DRAM read/write local to 3 [MegaEvents/s] 1.0E-06*UPMC3/time
+
+LONG
+Formulas:
+DRAM read/write local to 0 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_0/time
+DRAM read/write local to 1 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_1/time
+DRAM read/write local to 2 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_2/time
+DRAM read/write local to 3 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_3/time
+-
+Profiling group to measure the traffic from the local CPU to the different
+DRAM NUMA nodes. This group allows you to detect NUMA problems in a threaded
+code. You must first determine on which memory domains your code is running.
+A code should only have significant traffic to its own memory domain.
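To make the NUMA group's intent concrete, here is a minimal C sketch that turns the four per-node counters into rates and a locality fraction; the counter values are invented, and the assumption that the code under test runs in domain 0 is purely for illustration.

```c
#include <stdio.h>

/* Minimal sketch: NUMA group evaluation. upmc[i] holds
 * UNC_CPU_TO_DRAM_LOCAL_TO_i; values are invented. Assuming the measured
 * code runs in NUMA domain 0, everything not local to 0 is remote traffic. */
int main(void)
{
    double upmc[4] = { 9.0e8, 3.0e7, 2.0e7, 1.0e7 };
    double time = 10.0, total = 0.0;

    for (int i = 0; i < 4; i++) {
        total += upmc[i];
        printf("DRAM read/write local to %d [MegaEvents/s] %.2f\n",
               i, 1.0e-6 * upmc[i] / time);
    }
    /* A well-placed code should have most traffic in its own domain. */
    printf("Fraction of traffic local to domain 0: %.2f\n", upmc[0] / total);
    return 0;
}
```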
+
+
diff --git a/collectors/likwid/groups/interlagos/NUMA_0_3.txt b/collectors/likwid/groups/interlagos/NUMA_0_3.txt
new file mode 100644
index 0000000..79f3618
--- /dev/null
+++ b/collectors/likwid/groups/interlagos/NUMA_0_3.txt
@@ -0,0 +1,28 @@
+SHORT Read/Write Events between the ccNUMA nodes
+
+EVENTSET
+UPMC0 UNC_CPU_TO_DRAM_LOCAL_TO_0
+UPMC1 UNC_CPU_TO_DRAM_LOCAL_TO_1
+UPMC2 UNC_CPU_TO_DRAM_LOCAL_TO_2
+UPMC3 UNC_CPU_TO_DRAM_LOCAL_TO_3
+
+METRICS
+Runtime (RDTSC) [s] time
+DRAM read/write local to 0 [MegaEvents/s] 1.0E-06*UPMC0/time
+DRAM read/write local to 1 [MegaEvents/s] 1.0E-06*UPMC1/time
+DRAM read/write local to 2 [MegaEvents/s] 1.0E-06*UPMC2/time
+DRAM read/write local to 3 [MegaEvents/s] 1.0E-06*UPMC3/time
+
+LONG
+Formulas:
+DRAM read/write local to 0 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_0/time
+DRAM read/write local to 1 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_1/time
+DRAM read/write local to 2 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_2/time
+DRAM read/write local to 3 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_3/time
+-
+Profiling group to measure the traffic from the local CPU to the different
+DRAM NUMA nodes. This group allows you to detect NUMA problems in a threaded
+code. You must first determine on which memory domains your code is running.
+A code should only have significant traffic to its own memory domain.
+
+
diff --git a/collectors/likwid/groups/interlagos/NUMA_4_7.txt b/collectors/likwid/groups/interlagos/NUMA_4_7.txt
new file mode 100644
index 0000000..0e05776
--- /dev/null
+++ b/collectors/likwid/groups/interlagos/NUMA_4_7.txt
@@ -0,0 +1,28 @@
+SHORT Read/Write Events between the ccNUMA nodes
+
+EVENTSET
+UPMC0 UNC_CPU_TO_DRAM_LOCAL_TO_4
+UPMC1 UNC_CPU_TO_DRAM_LOCAL_TO_5
+UPMC2 UNC_CPU_TO_DRAM_LOCAL_TO_6
+UPMC3 UNC_CPU_TO_DRAM_LOCAL_TO_7
+
+METRICS
+Runtime (RDTSC) [s] time
+DRAM read/write local to 4 [MegaEvents/s] 1.0E-06*UPMC0/time
+DRAM read/write local to 5 [MegaEvents/s] 1.0E-06*UPMC1/time
+DRAM read/write local to 6 [MegaEvents/s] 1.0E-06*UPMC2/time
+DRAM read/write local to 7 [MegaEvents/s] 1.0E-06*UPMC3/time
+
+LONG
+Formulas:
+DRAM read/write local to 4 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_4/time
+DRAM read/write local to 5 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_5/time
+DRAM read/write local to 6 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_6/time
+DRAM read/write local to 7 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_7/time
+-
+Profiling group to measure the traffic from the local CPU to the different
+DRAM NUMA nodes. This group allows you to detect NUMA problems in a threaded
+code. You must first determine on which memory domains your code is running.
+A code should only have significant traffic to its own memory domain.
+
+
diff --git a/collectors/likwid/groups/ivybridge/BRANCH.txt b/collectors/likwid/groups/ivybridge/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/collectors/likwid/groups/ivybridge/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 BR_INST_RETIRED_ALL_BRANCHES
+PMC1 BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Branch rate PMC0/FIXC0
+Branch misprediction rate PMC1/FIXC0
+Branch misprediction ratio PMC1/PMC0
+Instructions per branch FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often, on average, a branch or a mispredicted branch occurred
+per retired instruction. The branch misprediction ratio directly relates the
+number of mispredicted branches to the total number of branch instructions.
+Instructions per branch is 1/branch rate.
+
diff --git a/collectors/likwid/groups/ivybridge/CLOCK.txt b/collectors/likwid/groups/ivybridge/CLOCK.txt
new file mode 100644
index 0000000..fb19101
--- /dev/null
+++ b/collectors/likwid/groups/ivybridge/CLOCK.txt
@@ -0,0 +1,26 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+UBOXFIX UNCORE_CLOCK
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+Uncore Clock [MHz] 1.E-06*UBOXFIX/time
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+
+LONG
+Formulas:
+Power = PWR_PKG_ENERGY / time
+Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time
+-
+IvyBridge implements the new RAPL interface. This interface enables
+monitoring of the consumed energy on the package (socket) level.
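A minimal C sketch of the derived CLOCK metrics, assuming the RAPL energy counter reading has already been converted to Joules accumulated over the measurement interval; all values are invented.

```c
#include <stdio.h>

/* Minimal sketch: Power and Uncore Clock from the CLOCK group.
 * pkg_energy is the PWR_PKG_ENERGY reading in Joules for the interval;
 * uncore_cycles is UBOXFIX. Values are invented. */
int main(void)
{
    double pkg_energy    = 450.0;   /* PWR0 [J] */
    double uncore_cycles = 2.4e10;  /* UBOXFIX: UNCORE_CLOCK */
    double time          = 10.0;    /* Runtime (RDTSC) [s] */

    printf("Energy [J]         %.1f\n", pkg_energy);
    printf("Power [W]          %.1f\n", pkg_energy / time);
    printf("Uncore Clock [MHz] %.1f\n", 1.0e-6 * uncore_cycles / time);
    return 0;
}
```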
+ diff --git a/collectors/likwid/groups/ivybridge/CYCLE_ACTIVITY.txt b/collectors/likwid/groups/ivybridge/CYCLE_ACTIVITY.txt new file mode 100644 index 0000000..c432a44 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/CYCLE_ACTIVITY.txt @@ -0,0 +1,38 @@ +SHORT Cycle Activities + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING +PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING +PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING +PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Cycles without execution [%] (PMC3/FIXC1)*100 +Cycles without execution due to L1D [%] (PMC2/FIXC1)*100 +Cycles without execution due to L2 [%] (PMC0/FIXC1)*100 +Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100 +Cycles with stalls due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles with stalls due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100 +-- +This performance group measures the cycles while waiting for data from the cache +and memory hierarchy. +CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts number of cycles nothing is executed on +any execution port. +CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while memory subsystem has an +outstanding load. 
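All four CYCLE_ACTIVITY metrics normalize the same way, to the unhalted core cycles in FIXC1; a minimal C sketch with invented counter values:

```c
#include <stdio.h>

/* Minimal sketch: CYCLE_ACTIVITY percentages, each normalized to
 * CPU_CLK_UNHALTED_CORE (FIXC1). Counter values are invented. */
int main(void)
{
    double core_cycles = 1.0e10;                    /* FIXC1 */
    const char *name[] = { "due to L2", "due to memory loads",
                           "due to L1D", "without execution" };
    double pmc[] = { 1.5e9, 2.5e9, 1.0e9, 3.0e9 };  /* PMC0..PMC3 */

    for (int i = 0; i < 4; i++)
        printf("Cycles %-22s [%%] %.1f\n", name[i],
               100.0 * pmc[i] / core_cycles);
    return 0;
}
```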
diff --git a/collectors/likwid/groups/ivybridge/CYCLE_STALLS.txt b/collectors/likwid/groups/ivybridge/CYCLE_STALLS.txt new file mode 100644 index 0000000..795aeb9 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/CYCLE_STALLS.txt @@ -0,0 +1,45 @@ +SHORT Cycle Activities (Stalls) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING +PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING +PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING +PMC3 CYCLE_ACTIVITY_STALLS_TOTAL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Total execution stalls PMC3 +Stalls caused by L1D misses [%] (PMC2/PMC3)*100 +Stalls caused by L2 misses [%] (PMC0/PMC3)*100 +Stalls caused by memory loads [%] (PMC1/PMC3)*100 +Execution stall rate [%] (PMC3/FIXC1)*100 +Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100 +Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100 +Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL +Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100 +-- +This performance group measures the stalls caused by data traffic in the cache +hierarchy. +CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls. +CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while memory subsystem has +an outstanding load. diff --git a/collectors/likwid/groups/ivybridge/DATA.txt b/collectors/likwid/groups/ivybridge/DATA.txt new file mode 100644 index 0000000..967cbad --- /dev/null +++ b/collectors/likwid/groups/ivybridge/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_UOPS_RETIRED_LOADS +PMC1 MEM_UOPS_RETIRED_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES +- +This is a metric to determine your load to store ratio. 
+ diff --git a/collectors/likwid/groups/ivybridge/DIVIDE.txt b/collectors/likwid/groups/ivybridge/DIVIDE.txt new file mode 100644 index 0000000..f8cb0b3 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ARITH_NUM_DIV +PMC1 ARITH_FPU_DIV_ACTIVE + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_NUM_DIV +Avg. divide unit usage duration = ARITH_FPU_DIV_ACTIVE/ARITH_NUM_DIV +- +This performance group measures the average latency of divide operations diff --git a/collectors/likwid/groups/ivybridge/ENERGY.txt b/collectors/likwid/groups/ivybridge/ENERGY.txt new file mode 100644 index 0000000..92a6915 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/ENERGY.txt @@ -0,0 +1,37 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR2 PWR_PP1_ENERGY +PWR3 PWR_DRAM_ENERGY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy PP1 [J] PWR2 +Power PP1 [W] PWR2/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power PP1 = PWR_PP1_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +IvyBridge implements the new RAPL interface. This interface enables to +monitor the consumed energy on the package (socket), the PP0 domain +and DRAM level. The PP0 domain often refers to only the CPU cores. diff --git a/collectors/likwid/groups/ivybridge/FALSE_SHARE.txt b/collectors/likwid/groups/ivybridge/FALSE_SHARE.txt new file mode 100644 index 0000000..fbec3f4 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/FALSE_SHARE.txt @@ -0,0 +1,25 @@ +SHORT False sharing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM +PMC2 MEM_LOAD_UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local LLC false sharing [MByte] 1.E-06*PMC0*64 +Local LLC false sharing rate PMC0/PMC2 + +LONG +Formulas: +Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM*64 +Local LLC false sharing rate = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL +- +False-sharing of cache lines can dramatically reduce the performance of an +application. This performance group measures the L3 traffic induced by false-sharing. +The false-sharing rate uses all memory loads as reference. 
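For readers who want to reproduce what the FALSE_SHARE group detects, here is a minimal C sketch (not part of the group files) of the access pattern it is designed to expose: two threads updating counters that share one 64-byte cache line. Compile with cc -O2 -pthread; uncommenting the padding gives each counter its own line and removes the HITM traffic.

```c
#include <pthread.h>
#include <stdio.h>

/* Minimal sketch: provoke false sharing. Both counters live in the same
 * 64-byte cache line; padding each slot to 64 bytes fixes it. */
#define N 100000000

struct { volatile long c; /* char pad[56]; <- uncomment to fix */ } counter[2];

static void *worker(void *arg)
{
    int id = *(int *)arg;
    for (long i = 0; i < N; i++)
        counter[id].c++;   /* each thread touches only its own counter */
    return NULL;
}

int main(void)
{
    pthread_t t[2];
    int id[2] = { 0, 1 };
    for (int i = 0; i < 2; i++) pthread_create(&t[i], NULL, worker, &id[i]);
    for (int i = 0; i < 2; i++) pthread_join(t[i], NULL);
    printf("%ld %ld\n", counter[0].c, counter[1].c);
    return 0;
}
```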
diff --git a/collectors/likwid/groups/ivybridge/FLOPS_AVX.txt b/collectors/likwid/groups/ivybridge/FLOPS_AVX.txt
new file mode 100644
index 0000000..526d550
--- /dev/null
+++ b/collectors/likwid/groups/ivybridge/FLOPS_AVX.txt
@@ -0,0 +1,25 @@
+SHORT Packed AVX MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 SIMD_FP_256_PACKED_SINGLE
+PMC1 SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0)/time
+Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0)/time
+
+LONG
+Formulas:
+Packed SP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed DP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+-
+Packed 32-byte (256-bit) AVX FLOP rates. Please note that the current FLOP measurements on IvyBridge are
+potentially wrong. So you cannot trust these counters at the moment!
+
diff --git a/collectors/likwid/groups/ivybridge/FLOPS_DP.txt b/collectors/likwid/groups/ivybridge/FLOPS_DP.txt
new file mode 100644
index 0000000..e737098
--- /dev/null
+++ b/collectors/likwid/groups/ivybridge/FLOPS_DP.txt
@@ -0,0 +1,33 @@
+SHORT Double Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2 SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time
+Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
+Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2)
+
+LONG
+Formulas:
+DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX DP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+Vectorization ratio = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)
+-
+SSE scalar and packed double precision FLOP rates. Please note that the current
+FLOP measurements on IvyBridge are potentially wrong.
+So you cannot trust these counters at the moment!
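The FLOPS_DP weighting (2 FLOPs per SSE packed double op, 1 per scalar op, 4 per 256-bit AVX packed double op) reduces to the following C sketch; counter values are invented.

```c
#include <stdio.h>

/* Minimal sketch: the DP [MFLOP/s] and vectorization-ratio metrics.
 * Counter values are invented. */
int main(void)
{
    double sse_packed = 5.0e9;  /* PMC0: FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE */
    double sse_scalar = 1.0e9;  /* PMC1: FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE */
    double avx_packed = 2.0e9;  /* PMC2: SIMD_FP_256_PACKED_DOUBLE */
    double time = 10.0;

    double mflops = 1.0e-6 * (sse_packed * 2.0 + sse_scalar + avx_packed * 4.0) / time;
    double vec_ratio = 100.0 * (sse_packed + avx_packed)
                     / (sse_packed + sse_scalar + avx_packed);

    printf("DP [MFLOP/s]        %.1f\n", mflops);
    printf("Vectorization ratio %.1f\n", vec_ratio);
    return 0;
}
```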
+
diff --git a/collectors/likwid/groups/ivybridge/FLOPS_SP.txt b/collectors/likwid/groups/ivybridge/FLOPS_SP.txt
new file mode 100644
index 0000000..7483722
--- /dev/null
+++ b/collectors/likwid/groups/ivybridge/FLOPS_SP.txt
@@ -0,0 +1,33 @@
+SHORT Single Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2 SIMD_FP_256_PACKED_SINGLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time
+Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
+Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2)
+
+LONG
+Formulas:
+SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX SP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+Vectorization ratio = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)
+-
+SSE scalar and packed single precision FLOP rates. Please note that the current
+FLOP measurements on IvyBridge are potentially wrong.
+So you cannot trust these counters at the moment!
+
diff --git a/collectors/likwid/groups/ivybridge/ICACHE.txt b/collectors/likwid/groups/ivybridge/ICACHE.txt
new file mode 100644
index 0000000..f1e2335
--- /dev/null
+++ b/collectors/likwid/groups/ivybridge/ICACHE.txt
@@ -0,0 +1,33 @@
+SHORT Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ICACHE_ACCESSES
+PMC1 ICACHE_MISSES
+PMC2 ICACHE_IFETCH_STALL
+PMC3 ILD_STALL_IQ_FULL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+L1I stalls PMC2
+L1I stall rate PMC2/FIXC0
+L1I queue full stalls PMC3
+L1I queue full stall rate PMC3/FIXC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+L1I stalls = ICACHE_IFETCH_STALL
+L1I stall rate = ICACHE_IFETCH_STALL / INSTR_RETIRED_ANY
+-
+This group measures some L1 instruction cache metrics.
diff --git a/collectors/likwid/groups/ivybridge/L2.txt b/collectors/likwid/groups/ivybridge/L2.txt
new file mode 100644
index 0000000..376e974
--- /dev/null
+++ b/collectors/likwid/groups/ivybridge/L2.txt
@@ -0,0 +1,38 @@
+SHORT L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L1D_REPLACEMENT
+PMC1 L1D_M_EVICT
+PMC2 ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the
+number of cache lines allocated in the L1 and the number of modified cache lines
+evicted from the L1. The group also outputs the total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1 and cache lines transferred into the instruction
+cache.
+
diff --git a/collectors/likwid/groups/ivybridge/L2CACHE.txt b/collectors/likwid/groups/ivybridge/L2CACHE.txt
new file mode 100644
index 0000000..9b5dd4b
--- /dev/null
+++ b/collectors/likwid/groups/ivybridge/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_TRANS_ALL_REQUESTS
+PMC1 L2_RQSTS_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. The L2 request rate tells you how data intensive your code is,
+i.e. how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure of how often it was necessary to get
+cache lines from memory. And finally, the L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm, you should
+try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/collectors/likwid/groups/ivybridge/L3.txt b/collectors/likwid/groups/ivybridge/L3.txt
new file mode 100644
index 0000000..f0a8aad
--- /dev/null
+++ b/collectors/likwid/groups/ivybridge/L3.txt
@@ -0,0 +1,36 @@
+SHORT L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_LINES_IN_ALL
+PMC1 L2_LINES_OUT_DIRTY_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DIRTY_ALL*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DIRTY_ALL*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed from the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. This group also outputs the data volume transferred between the
+L3 and the measured cores' L2 caches. Note that this bandwidth also includes data
+transfers due to a write allocate load on a store miss in L2.
+
diff --git a/collectors/likwid/groups/ivybridge/L3CACHE.txt b/collectors/likwid/groups/ivybridge/L3CACHE.txt
new file mode 100644
index 0000000..9f3036f
--- /dev/null
+++ b/collectors/likwid/groups/ivybridge/L3CACHE.txt
@@ -0,0 +1,36 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_UOPS_RETIRED_L3_ALL
+PMC1 MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2 UOPS_RETIRED_ALL
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 request rate (PMC0)/PMC2
+L3 miss rate PMC1/PMC2
+L3 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
+-
+This group measures the locality of your data accesses with regard to the
+L3 cache. The L3 request rate tells you how data intensive your code is,
+i.e. how many data accesses you have on average per instruction.
+The L3 miss rate gives a measure of how often it was necessary to get
+cache lines from memory. And finally, the L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm, you should
+try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+ + diff --git a/collectors/likwid/groups/ivybridge/PORT_USAGE.txt b/collectors/likwid/groups/ivybridge/PORT_USAGE.txt new file mode 100644 index 0000000..d509607 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/PORT_USAGE.txt @@ -0,0 +1,40 @@ +SHORT Execution port utilization + +REQUIRE_NOHT + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_DISPATCHED_PORT_PORT_0 +PMC1 UOPS_DISPATCHED_PORT_PORT_1 +PMC2 UOPS_DISPATCHED_PORT_PORT_2 +PMC3 UOPS_DISPATCHED_PORT_PORT_3 +PMC4 UOPS_DISPATCHED_PORT_PORT_4 +PMC5 UOPS_DISPATCHED_PORT_PORT_5 + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Port0 usage ratio PMC0/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port1 usage ratio PMC1/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port2 usage ratio PMC2/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port3 usage ratio PMC3/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port4 usage ratio PMC4/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port5 usage ratio PMC5/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) + +LONG +Formulas: +Port0 usage ratio = UOPS_DISPATCHED_PORT_PORT_0/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port1 usage ratio = UOPS_DISPATCHED_PORT_PORT_1/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port2 usage ratio = UOPS_DISPATCHED_PORT_PORT_2/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port3 usage ratio = UOPS_DISPATCHED_PORT_PORT_3/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port4 usage ratio = UOPS_DISPATCHED_PORT_PORT_4/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port5 usage ratio = UOPS_DISPATCHED_PORT_PORT_5/SUM(UOPS_DISPATCHED_PORT_PORT_*) +- +This group measures the execution port utilization in a CPU core. The group can +only be measured when HyperThreading is disabled because only then each CPU core +can program eight counters. diff --git a/collectors/likwid/groups/ivybridge/RECOVERY.txt b/collectors/likwid/groups/ivybridge/RECOVERY.txt new file mode 100644 index 0000000..7928ee4 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/RECOVERY.txt @@ -0,0 +1,22 @@ +SHORT Recovery duration + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INT_MISC_RECOVERY_CYCLES +PMC1 INT_MISC_RECOVERY_COUNT + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Avg. recovery duration PMC0/PMC1 + +LONG +Formulas: +Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT +- +This group measures the duration of recoveries after SSE exception, memory +disambiguation, etc... 
diff --git a/collectors/likwid/groups/ivybridge/TLB_DATA.txt b/collectors/likwid/groups/ivybridge/TLB_DATA.txt
new file mode 100644
index 0000000..8d94e05
--- /dev/null
+++ b/collectors/likwid/groups/ivybridge/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2 DTLB_LOAD_MISSES_WALK_DURATION
+PMC3 DTLB_STORE_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 DTLB load misses PMC0
+L1 DTLB load miss rate PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses PMC1
+L1 DTLB store miss rate PMC1/FIXC0
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The duration measures how long, in cycles, a page table walk took.
+
diff --git a/collectors/likwid/groups/ivybridge/TLB_INSTR.txt b/collectors/likwid/groups/ivybridge/TLB_INSTR.txt
new file mode 100644
index 0000000..235d977
--- /dev/null
+++ b/collectors/likwid/groups/ivybridge/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ITLB_MISSES_CAUSES_A_WALK
+PMC1 ITLB_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 ITLB misses PMC0
+L1 ITLB miss rate PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how long, in cycles, a page table walk took.
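A minimal C sketch of the two derived TLB quantities, miss rate and average walk duration, with invented counter values:

```c
#include <stdio.h>

/* Minimal sketch: TLB_DATA evaluation for the load path. The duration
 * metric is total cycles spent walking divided by the number of walks.
 * Values are invented. */
int main(void)
{
    double instr       = 1.0e9;  /* FIXC0: INSTR_RETIRED_ANY */
    double load_walks  = 2.0e6;  /* PMC0: DTLB_LOAD_MISSES_CAUSES_A_WALK */
    double walk_cycles = 6.0e7;  /* PMC2: DTLB_LOAD_MISSES_WALK_DURATION */

    printf("L1 DTLB load miss rate           %.6f\n", load_walks / instr);
    printf("L1 DTLB load miss duration [Cyc] %.1f\n", walk_cycles / load_walks);
    return 0;
}
```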
+ diff --git a/collectors/likwid/groups/ivybridge/TMA.txt b/collectors/likwid/groups/ivybridge/TMA.txt new file mode 100644 index 0000000..afb4126 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/TMA.txt @@ -0,0 +1,48 @@ +SHORT Top down cycle allocation + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_RETIRED_RETIRE_SLOTS +PMC2 IDQ_UOPS_NOT_DELIVERED_CORE +PMC3 INT_MISC_RECOVERY_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +IPC FIXC0/FIXC1 +Total Slots 4*FIXC1 +Slots Retired PMC1 +Fetch Bubbles PMC2 +Recovery Bubbles 4*PMC3 +Front End [%] PMC2/(4*FIXC1)*100 +Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100 +Retiring [%] PMC1/(4*FIXC1)*100 +Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100 + +LONG +Formulas: +Total Slots = 4*CPU_CLK_UNHALTED_CORE +Slots Retired = UOPS_RETIRED_RETIRE_SLOTS +Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE +Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES +Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100 +Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100 +Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100 +Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100 +-- +This performance group measures cycles to determine percentage of time spent in +front end, back end, retiring and speculation. These metrics are published and +verified by Intel. Further information: +Webpage describing Top-Down Method and its usage in Intel vTune: +https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method +Paper by Yasin Ahmad: +https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0 +Slides by Yasin Ahmad: +http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf +The performance group was originally published here: +http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/ diff --git a/collectors/likwid/groups/ivybridge/UOPS.txt b/collectors/likwid/groups/ivybridge/UOPS.txt new file mode 100644 index 0000000..e6cc208 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/UOPS.txt @@ -0,0 +1,35 @@ +SHORT UOPs execution info + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_EXECUTED_THREAD +PMC2 UOPS_RETIRED_ALL +PMC3 UOPS_ISSUED_FLAGS_MERGE + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Issued UOPs PMC0 +Merged UOPs PMC3 +Executed UOPs PMC1 +Retired UOPs PMC2 + +LONG +Formulas: +Issued UOPs = UOPS_ISSUED_ANY +Merged UOPs = UOPS_ISSUED_FLAGS_MERGE +Executed UOPs = UOPS_EXECUTED_THREAD +Retired UOPs = UOPS_RETIRED_ALL +- +This group returns information about the instruction pipeline. It measures the +issued, executed and retired uOPs and returns the number of uOPs which were issued +but not executed as well as the number of uOPs which were executed but never retired. +The executed but not retired uOPs commonly come from speculatively executed branches. 
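Returning to the TMA group shown above: each core cycle provides four issue slots, and the four buckets partition those slots, so they should sum to roughly 100%. A minimal C sketch with invented counter values (note the Back End bucket is computed as the remainder, which is algebraically identical to the group's formula):

```c
#include <stdio.h>

/* Minimal sketch: TMA top-down buckets from the 4-slot model.
 * Counter values are invented. */
int main(void)
{
    double cycles        = 1.0e10; /* FIXC1: CPU_CLK_UNHALTED_CORE */
    double uops_issued   = 2.8e10; /* PMC0: UOPS_ISSUED_ANY */
    double uops_retired  = 2.6e10; /* PMC1: UOPS_RETIRED_RETIRE_SLOTS */
    double fetch_bubbles = 6.0e9;  /* PMC2: IDQ_UOPS_NOT_DELIVERED_CORE */
    double recovery_cyc  = 2.0e8;  /* PMC3: INT_MISC_RECOVERY_CYCLES */
    double slots = 4.0 * cycles;

    double frontend    = fetch_bubbles / slots * 100.0;
    double speculation = (uops_issued - uops_retired + 4.0 * recovery_cyc)
                       / slots * 100.0;
    double retiring    = uops_retired / slots * 100.0;
    double backend     = 100.0 - frontend - speculation - retiring;

    printf("Front End [%%]   %.1f\n", frontend);
    printf("Speculation [%%] %.1f\n", speculation);
    printf("Retiring [%%]    %.1f\n", retiring);
    printf("Back End [%%]    %.1f\n", backend);
    return 0;
}
```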
+
diff --git a/collectors/likwid/groups/ivybridge/UOPS_EXEC.txt b/collectors/likwid/groups/ivybridge/UOPS_EXEC.txt
new file mode 100644
index 0000000..7042df7
--- /dev/null
+++ b/collectors/likwid/groups/ivybridge/UOPS_EXEC.txt
@@ -0,0 +1,31 @@
+SHORT UOPs execution
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_EXECUTED_USED_CYCLES
+PMC1 UOPS_EXECUTED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the execution stage in the pipeline. Used cycles are all cycles where uOPs are
+executed while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
diff --git a/collectors/likwid/groups/ivybridge/UOPS_ISSUE.txt b/collectors/likwid/groups/ivybridge/UOPS_ISSUE.txt
new file mode 100644
index 0000000..9aac923
--- /dev/null
+++ b/collectors/likwid/groups/ivybridge/UOPS_ISSUE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs issuing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_ISSUED_USED_CYCLES
+PMC1 UOPS_ISSUED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the issue stage in the pipeline. Used cycles are all cycles where uOPs are
+issued while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
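What the :EDGEDETECT qualifier buys in these groups: the plain counter accumulates stall cycles (a level), while the edge-detected variant counts distinct stall episodes, so the quotient is the average episode length. A minimal C sketch with invented values:

```c
#include <stdio.h>

/* Minimal sketch: average stall duration from a level counter and its
 * edge-detected variant. Values are invented. */
int main(void)
{
    double stall_cycles   = 2.0e9;  /* PMC1: UOPS_ISSUED_STALL_CYCLES */
    double stall_episodes = 5.0e7;  /* PMC3:EDGEDETECT of the same event */

    printf("Avg stall duration [cycles] %.1f\n",
           stall_cycles / stall_episodes);
    return 0;
}
```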
diff --git a/collectors/likwid/groups/ivybridge/UOPS_RETIRE.txt b/collectors/likwid/groups/ivybridge/UOPS_RETIRE.txt
new file mode 100644
index 0000000..0f37585
--- /dev/null
+++ b/collectors/likwid/groups/ivybridge/UOPS_RETIRE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs retirement
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_RETIRED_USED_CYCLES
+PMC1 UOPS_RETIRED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the retirement stage in the pipeline (re-order buffer). Used cycles are all
+cycles where uOPs are retired while unused cycles refer to pipeline stalls.
+Moreover, the group calculates the average stall duration in cycles.
diff --git a/collectors/likwid/groups/ivybridgeEP/BRANCH.txt b/collectors/likwid/groups/ivybridgeEP/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/collectors/likwid/groups/ivybridgeEP/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 BR_INST_RETIRED_ALL_BRANCHES
+PMC1 BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Branch rate PMC0/FIXC0
+Branch misprediction rate PMC1/FIXC0
+Branch misprediction ratio PMC1/PMC0
+Instructions per branch FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often, on average, a branch or a mispredicted branch occurred
+per retired instruction. The branch misprediction ratio directly relates the
+number of mispredicted branches to the total number of branch instructions.
+Instructions per branch is 1/branch rate.
+ diff --git a/collectors/likwid/groups/ivybridgeEP/CACHES.txt b/collectors/likwid/groups/ivybridgeEP/CACHES.txt new file mode 100644 index 0000000..fd1d43f --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/CACHES.txt @@ -0,0 +1,121 @@ +SHORT Cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L1D_M_EVICT +PMC2 L2_LINES_IN_ALL +PMC3 L2_LINES_OUT_DIRTY_ALL +CBOX0C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX1C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX2C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX3C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX4C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX5C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX6C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX7C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX8C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX9C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX10C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX11C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX12C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX13C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX14C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX0C1 LLC_VICTIMS_M_STATE +CBOX1C1 LLC_VICTIMS_M_STATE +CBOX2C1 LLC_VICTIMS_M_STATE +CBOX3C1 LLC_VICTIMS_M_STATE +CBOX4C1 LLC_VICTIMS_M_STATE +CBOX5C1 LLC_VICTIMS_M_STATE +CBOX6C1 LLC_VICTIMS_M_STATE +CBOX7C1 LLC_VICTIMS_M_STATE +CBOX8C1 LLC_VICTIMS_M_STATE +CBOX9C1 LLC_VICTIMS_M_STATE +CBOX10C1 LLC_VICTIMS_M_STATE +CBOX11C1 LLC_VICTIMS_M_STATE +CBOX12C1 LLC_VICTIMS_M_STATE +CBOX13C1 LLC_VICTIMS_M_STATE +CBOX14C1 LLC_VICTIMS_M_STATE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 +L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time +L3 to L2 load data volume [GBytes] 1.0E-09*PMC2*64.0 +L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time +L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 +System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F+CBOX8C0:STATE=0x3F+CBOX9C0:STATE=0x3F+CBOX10C0:STATE=0x3F+CBOX11C0:STATE=0x3F+CBOX12C0:STATE=0x3F+CBOX13C0:STATE=0x3F+CBOX14C0:STATE=0x3F)*64.0/time +System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F+CBOX8C0:STATE=0x3F+CBOX9C0:STATE=0x3F+ CBOX10C0:STATE=0x3F+CBOX11C0:STATE=0x3F+CBOX12C0:STATE=0x3F+CBOX13C0:STATE=0x3F+CBOX14C0:STATE=0x3F)*64.0 +L3 to memory bandwidth [MBytes/s] 
1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1)*64/time
+L3 to memory data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1)*64
+L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F+CBOX8C0:STATE=0x3F+CBOX9C0:STATE=0x3F+CBOX10C0:STATE=0x3F+CBOX11C0:STATE=0x3F+CBOX12C0:STATE=0x3F+CBOX13C0:STATE=0x3F+CBOX14C0:STATE=0x3F+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1)*64.0/time
+L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F+CBOX8C0:STATE=0x3F+CBOX9C0:STATE=0x3F+CBOX10C0:STATE=0x3F+CBOX11C0:STATE=0x3F+CBOX12C0:STATE=0x3F+CBOX13C0:STATE=0x3F+CBOX14C0:STATE=0x3F+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1)*64.0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formulas:
+L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
+L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64
+L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
+L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64
+L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time
+L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64
+L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time
+L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64
+L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DIRTY_ALL*64/time
+L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DIRTY_ALL*64
+L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64/time
+L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64
+System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F))*64/time
+System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F))*64
+L3 to memory bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M_STATE))*64/time
+L3 to memory data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M_STATE))*64
+L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F)+SUM(LLC_VICTIMS_M_STATE))*64/time
+L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F)+SUM(LLC_VICTIMS_M_STATE))*64
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0
+-
+Group to measure cache transfers between L1 and memory. Please note that the
+L3 to/from system metrics contain any traffic to the system (memory,
+Intel QPI, etc.) but do not seem to capture all of it, because the memory read
+bandwidth and the L3 to L2 bandwidth are commonly higher than the system to L3 bandwidth.
+
diff --git a/collectors/likwid/groups/ivybridgeEP/CBOX.txt b/collectors/likwid/groups/ivybridgeEP/CBOX.txt
new file mode 100644
index 0000000..5c87149
--- /dev/null
+++ b/collectors/likwid/groups/ivybridgeEP/CBOX.txt
@@ -0,0 +1,55 @@
+SHORT CBOX related data and metrics
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+CBOX0C0 LLC_VICTIMS_M_STATE
+CBOX1C0 LLC_VICTIMS_M_STATE
+CBOX2C0 LLC_VICTIMS_M_STATE
+CBOX3C0 LLC_VICTIMS_M_STATE
+CBOX4C0 LLC_VICTIMS_M_STATE
+CBOX5C0 LLC_VICTIMS_M_STATE
+CBOX6C0 LLC_VICTIMS_M_STATE
+CBOX7C0 LLC_VICTIMS_M_STATE
+CBOX8C0 LLC_VICTIMS_M_STATE
+CBOX9C0 LLC_VICTIMS_M_STATE
+CBOX10C0 LLC_VICTIMS_M_STATE
+CBOX11C0 LLC_VICTIMS_M_STATE
+CBOX12C0 LLC_VICTIMS_M_STATE
+CBOX13C0 LLC_VICTIMS_M_STATE
+CBOX14C0 LLC_VICTIMS_M_STATE
+CBOX0C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX1C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX2C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX3C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX4C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX5C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX6C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX7C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX8C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX9C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX10C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX11C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX12C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX13C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX14C1:STATE=0x1 LLC_LOOKUP_ANY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+LLC misses per instruction (CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0)/FIXC0
+LLC data written to MEM [MBytes] 1E-6*(CBOX0C1:STATE=0x1+CBOX1C1:STATE=0x1+CBOX2C1:STATE=0x1+CBOX3C1:STATE=0x1+CBOX4C1:STATE=0x1+CBOX5C1:STATE=0x1+CBOX6C1:STATE=0x1+CBOX7C1:STATE=0x1+CBOX8C1:STATE=0x1+CBOX9C1:STATE=0x1+CBOX10C1:STATE=0x1+CBOX11C1:STATE=0x1+CBOX12C1:STATE=0x1+CBOX13C1:STATE=0x1+CBOX14C1:STATE=0x1)*64
+
+
+LONG
+Formulas:
+LLC misses per instruction = sum(LLC_VICTIMS_M_STATE)/INSTR_RETIRED_ANY
+LLC data written to MEM [MBytes] = sum(LLC_LOOKUP_ANY:STATE=0x1)*64*1E-6
+--
+The CBOXes mediate the traffic from the L2 cache to the segmented L3 cache. Each
+CBOX is responsible for one segment (2.5 MByte). The boxes maintain the coherence between all
+CPU cores of the socket. Depending on the CPU core count, some CBOXes are not attached
+to a 2.5 MByte slice but are still active and track the traffic.
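The long per-box sums in the CACHES and CBOX metrics above are just reductions over identical counters in each box; a minimal C sketch (15 CBOXes, 64-byte cache lines, invented values):

```c
#include <stdio.h>

/* Minimal sketch: summing per-CBOX counters into one bandwidth figure,
 * as the CACHES/CBOX metric expressions do. Values are invented. */
int main(void)
{
    double llc_victims[15];           /* CBOXnC0: LLC_VICTIMS_M_STATE */
    double time = 10.0, sum = 0.0;

    for (int i = 0; i < 15; i++) llc_victims[i] = 1.0e7;  /* placeholder */
    for (int i = 0; i < 15; i++) sum += llc_victims[i];

    printf("L3 to memory bandwidth [MBytes/s] %.1f\n",
           1.0e-6 * sum * 64.0 / time);
    return 0;
}
```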
diff --git a/collectors/likwid/groups/ivybridgeEP/CLOCK.txt b/collectors/likwid/groups/ivybridgeEP/CLOCK.txt new file mode 100644 index 0000000..fb19101 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/CLOCK.txt @@ -0,0 +1,26 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +UBOXFIX UNCORE_CLOCK + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*UBOXFIX/time +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time +- +IvyBridge implements the new RAPL interface. This interface enables to +monitor the consumed energy on the package (socket) level. + diff --git a/collectors/likwid/groups/ivybridgeEP/CYCLE_ACTIVITY.txt b/collectors/likwid/groups/ivybridgeEP/CYCLE_ACTIVITY.txt new file mode 100644 index 0000000..c432a44 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/CYCLE_ACTIVITY.txt @@ -0,0 +1,38 @@ +SHORT Cycle Activities + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING +PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING +PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING +PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Cycles without execution [%] (PMC3/FIXC1)*100 +Cycles without execution due to L1D [%] (PMC2/FIXC1)*100 +Cycles without execution due to L2 [%] (PMC0/FIXC1)*100 +Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100 +Cycles with stalls due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles with stalls due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100 +-- +This performance group measures the cycles while waiting for data from the cache +and memory hierarchy. +CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts number of cycles nothing is executed on +any execution port. +CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while memory subsystem has an +outstanding load. 
diff --git a/collectors/likwid/groups/ivybridgeEP/CYCLE_STALLS.txt b/collectors/likwid/groups/ivybridgeEP/CYCLE_STALLS.txt new file mode 100644 index 0000000..795aeb9 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/CYCLE_STALLS.txt @@ -0,0 +1,45 @@ +SHORT Cycle Activities (Stalls) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING +PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING +PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING +PMC3 CYCLE_ACTIVITY_STALLS_TOTAL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Total execution stalls PMC3 +Stalls caused by L1D misses [%] (PMC2/PMC3)*100 +Stalls caused by L2 misses [%] (PMC0/PMC3)*100 +Stalls caused by memory loads [%] (PMC1/PMC3)*100 +Execution stall rate [%] (PMC3/FIXC1)*100 +Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100 +Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100 +Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL +Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100 +-- +This performance group measures the stalls caused by data traffic in the cache +hierarchy. +CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls. +CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while memory subsystem has +an outstanding load. diff --git a/collectors/likwid/groups/ivybridgeEP/DATA.txt b/collectors/likwid/groups/ivybridgeEP/DATA.txt new file mode 100644 index 0000000..967cbad --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_UOPS_RETIRED_LOADS +PMC1 MEM_UOPS_RETIRED_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES +- +This is a metric to determine your load to store ratio. 
+ diff --git a/collectors/likwid/groups/ivybridgeEP/DIVIDE.txt b/collectors/likwid/groups/ivybridgeEP/DIVIDE.txt new file mode 100644 index 0000000..f8cb0b3 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ARITH_NUM_DIV +PMC1 ARITH_FPU_DIV_ACTIVE + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_NUM_DIV +Avg. divide unit usage duration = ARITH_FPU_DIV_ACTIVE/ARITH_NUM_DIV +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/ivybridgeEP/ENERGY.txt b/collectors/likwid/groups/ivybridgeEP/ENERGY.txt new file mode 100644 index 0000000..74c16bb --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/ENERGY.txt @@ -0,0 +1,33 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR3 PWR_DRAM_ENERGY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +IvyBridge implements the new RAPL interface. This interface enables +monitoring of the consumed energy on the package (socket), the PP0 domain, +and the DRAM level. The PP0 domain often refers to only the CPU cores. diff --git a/collectors/likwid/groups/ivybridgeEP/FALSE_SHARE.txt b/collectors/likwid/groups/ivybridgeEP/FALSE_SHARE.txt new file mode 100644 index 0000000..5e28a15 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/FALSE_SHARE.txt @@ -0,0 +1,32 @@ +SHORT False sharing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM +PMC1 MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_HITM +PMC2 MEM_LOAD_UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local LLC false sharing [MByte] 1.E-06*PMC0*64 +Local LLC false sharing rate PMC0/PMC2 +Remote LLC false sharing [MByte] 1.E-06*PMC1*64 +Remote LLC false sharing rate PMC1/PMC2 + +LONG +Formulas: +Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM*64 +Local LLC false sharing rate = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL +Remote LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_HITM*64 +Remote LLC false sharing rate = MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_HITM/MEM_LOAD_UOPS_RETIRED_ALL +- +False-sharing of cache lines can dramatically reduce the performance of an +application. This performance group measures the L3 traffic induced by false-sharing. +The false-sharing rate uses all memory loads as reference. +For systems with multiple CPU sockets, this performance group also measures the +false-sharing of cache lines over socket boundaries.
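To make the FALSE_SHARE derivation concrete, here is a minimal C sketch (hypothetical counter values) that turns the HITM load counts into the traffic and rate metrics; every HITM-retired load is weighted with the 64-byte cache line size:

/* Sketch: FALSE_SHARE metric evaluation with made-up counter values. */
#include <stdio.h>

int main(void)
{
    double pmc0 = 5.0e6; /* MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM */
    double pmc1 = 1.0e6; /* MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_HITM */
    double pmc2 = 2.0e9; /* MEM_LOAD_UOPS_RETIRED_ALL */

    printf("Local LLC false sharing [MByte]:  %.2f\n", 1.0e-6 * pmc0 * 64.0);
    printf("Local LLC false sharing rate:     %.2e\n", pmc0 / pmc2);
    printf("Remote LLC false sharing [MByte]: %.2f\n", 1.0e-6 * pmc1 * 64.0);
    printf("Remote LLC false sharing rate:    %.2e\n", pmc1 / pmc2);
    return 0;
}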
diff --git a/collectors/likwid/groups/ivybridgeEP/FLOPS_AVX.txt b/collectors/likwid/groups/ivybridgeEP/FLOPS_AVX.txt new file mode 100644 index 0000000..0ad669f --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/FLOPS_AVX.txt @@ -0,0 +1,26 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 SIMD_FP_256_PACKED_SINGLE +PMC1 SIMD_FP_256_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime +- +Packed 32b AVX FLOPs rates. Please note that the current FLOP measurements on +IvyBridge are potentially wrong. +So you cannot trust these counters at the moment! + diff --git a/collectors/likwid/groups/ivybridgeEP/FLOPS_DP.txt b/collectors/likwid/groups/ivybridgeEP/FLOPS_DP.txt new file mode 100644 index 0000000..e737098 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/FLOPS_DP.txt @@ -0,0 +1,33 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE +PMC2 SIMD_FP_256_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE) +- +SSE scalar and packed double precision FLOP rates. Please note that the current +FLOP measurements on IvyBridge are potentially wrong. +So you cannot trust these counters at the moment! 
+ diff --git a/collectors/likwid/groups/ivybridgeEP/FLOPS_SP.txt b/collectors/likwid/groups/ivybridgeEP/FLOPS_SP.txt new file mode 100644 index 0000000..7483722 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/FLOPS_SP.txt @@ -0,0 +1,33 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE +PMC2 SIMD_FP_256_PACKED_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime +Vectorization ratio = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE) +- +SSE scalar and packed single precision FLOP rates. Please note that the current +FLOP measurements on IvyBridge are potentially wrong. +So you cannot trust these counters at the moment! + diff --git a/collectors/likwid/groups/ivybridgeEP/ICACHE.txt b/collectors/likwid/groups/ivybridgeEP/ICACHE.txt new file mode 100644 index 0000000..f1e2335 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/ICACHE.txt @@ -0,0 +1,33 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ICACHE_ACCESSES +PMC1 ICACHE_MISSES +PMC2 ICACHE_IFETCH_STALL +PMC3 ILD_STALL_IQ_FULL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 +L1I stalls PMC2 +L1I stall rate PMC2/FIXC0 +L1I queue full stalls PMC3 +L1I queue full stall rate PMC3/FIXC0 + +LONG +Formulas: +L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY +L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES +L1I stalls = ICACHE_IFETCH_STALL +L1I stall rate = ICACHE_IFETCH_STALL / INSTR_RETIRED_ANY +- +This group measures some L1 instruction cache metrics. 
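The FLOP weights in the FLOPS_SP and FLOPS_DP formulas above follow directly from the SIMD width: an SSE packed-single uop carries 4 FLOPs, a scalar-single uop 1, and a 256-bit AVX packed-single uop 8 (the double-precision weights are 2/1/4). A minimal C sketch of the single-precision rate, with hypothetical counts:

/* Sketch: FLOPS_SP metric from made-up counter values. */
double flops_sp_mflops(double packed128, double scalar, double packed256,
                       double seconds)
{
    /* 4 FLOPs per 128-bit packed uop, 1 per scalar uop,
     * 8 per 256-bit packed uop */
    return 1.0e-6 * (packed128 * 4.0 + scalar + packed256 * 8.0) / seconds;
}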
diff --git a/collectors/likwid/groups/ivybridgeEP/L2.txt b/collectors/likwid/groups/ivybridgeEP/L2.txt new file mode 100644 index 0000000..376e974 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/L2.txt @@ -0,0 +1,38 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L1D_M_EVICT +PMC2 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the +number of cache lines allocated in the L1 and the number of modified cache lines +evicted from the L1. The group also outputs total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and cache lines transferred into the instruction +cache. + diff --git a/collectors/likwid/groups/ivybridgeEP/L2CACHE.txt b/collectors/likwid/groups/ivybridgeEP/L2CACHE.txt new file mode 100644 index 0000000..9b5dd4b --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_TRANS_ALL_REQUESTS +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+ + diff --git a/collectors/likwid/groups/ivybridgeEP/L3.txt b/collectors/likwid/groups/ivybridgeEP/L3.txt new file mode 100644 index 0000000..f0a8aad --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/L3.txt @@ -0,0 +1,36 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ALL +PMC1 L2_LINES_OUT_DIRTY_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DIRTY_ALL*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DIRTY_ALL*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the +number of cache lines allocated in the L2 and the number of modified cache lines +evicted from the L2. This group also outputs the data volume transferred between the +L3 and the measured cores' L2 caches. Note that this bandwidth also includes data +transfers due to a write allocate load on a store miss in L2. + diff --git a/collectors/likwid/groups/ivybridgeEP/L3CACHE.txt b/collectors/likwid/groups/ivybridgeEP/L3CACHE.txt new file mode 100644 index 0000000..9f3036f --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/L3CACHE.txt @@ -0,0 +1,36 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_RETIRED_L3_ALL +PMC1 MEM_LOAD_UOPS_RETIRED_L3_MISS +PMC2 UOPS_RETIRED_ALL + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate (PMC0)/PMC2 +L3 miss rate PMC1/PMC2 +L3 miss ratio PMC1/PMC0 + +LONG +Formulas: +L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL +L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL +L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL +- +This group measures the locality of your data accesses with regard to the +L3 cache. L3 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L3 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+ + diff --git a/collectors/likwid/groups/ivybridgeEP/MEM.txt b/collectors/likwid/groups/ivybridgeEP/MEM.txt new file mode 100644 index 0000000..fd80c2c --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/MEM.txt @@ -0,0 +1,49 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events, it is only possible to measure on a +per-socket basis. Some of the counters may not be available on your system. +Also outputs total data volume transferred from main memory.
+ diff --git a/collectors/likwid/groups/ivybridgeEP/MEM_DP.txt b/collectors/likwid/groups/ivybridgeEP/MEM_DP.txt new file mode 100644 index 0000000..eff1677 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/MEM_DP.txt @@ -0,0 +1,75 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE +PMC2 SIMD_FP_256_PACKED_DOUBLE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +MFLOP/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time +AVX [MFLOP/s] 1.0E-06*(PMC2*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Operational intensity (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime +AVX [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +Operational intensity = (FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) +-- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per-socket basis. Also outputs total data volume transferred from main memory.
+SSE scalar and packed double precision FLOP rates. Also reports on packed AVX +32b instructions. Please note that the current FLOP measurements on IvyBridge +are potentially wrong. So you cannot trust these counters at the moment! +The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. + diff --git a/collectors/likwid/groups/ivybridgeEP/MEM_SP.txt b/collectors/likwid/groups/ivybridgeEP/MEM_SP.txt new file mode 100644 index 0000000..e541340 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/MEM_SP.txt @@ -0,0 +1,74 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE +PMC2 SIMD_FP_256_PACKED_SINGLE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +MFLOP/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time +AVX [MFLOP/s] 1.0E-06*(PMC2*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Operational intensity (PMC0*4.0+PMC1+PMC2*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime +AVX [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time +Memory data
volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +Operational intensity = (FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) +-- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per-socket basis. Also outputs total data volume transferred from main memory. +SSE scalar and packed single precision FLOP rates. Also reports on packed AVX +32b instructions. Please note that the current FLOP measurements on IvyBridge +are potentially wrong. So you cannot trust these counters at the moment! +The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. diff --git a/collectors/likwid/groups/ivybridgeEP/NUMA.txt b/collectors/likwid/groups/ivybridgeEP/NUMA.txt new file mode 100644 index 0000000..41fbe62 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/NUMA.txt @@ -0,0 +1,33 @@ +SHORT Local and remote memory accesses + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 OFFCORE_RESPONSE_0_LOCAL_DRAM +PMC1 OFFCORE_RESPONSE_1_REMOTE_DRAM + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local DRAM data volume [GByte] 1.E-09*PMC0*64 +Local DRAM bandwidth [MByte/s] 1.E-06*(PMC0*64)/time +Remote DRAM data volume [GByte] 1.E-09*PMC1*64 +Remote DRAM bandwidth [MByte/s] 1.E-06*(PMC1*64)/time +Memory data volume [GByte] 1.E-09*(PMC0+PMC1)*64 +Memory bandwidth [MByte/s] 1.E-06*((PMC0+PMC1)*64)/time + +LONG +Formulas: +CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY +Local DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_0_LOCAL_DRAM*64 +Local DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_0_LOCAL_DRAM*64)/time +Remote DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_1_REMOTE_DRAM*64 +Remote DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_1_REMOTE_DRAM*64)/time +Memory data volume [GByte] = 1.E-09*(OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64 +Memory bandwidth [MByte/s] = 1.E-06*((OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64)/time +-- +This performance group measures the data traffic of CPU cores to local and remote +memory.
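The NUMA metrics are again counts times the 64-byte line size; the interesting signal is how local traffic compares to remote traffic. A small C sketch with hypothetical counter values:

/* Sketch: NUMA group evaluation, made-up counter values. */
#include <stdio.h>

int main(void)
{
    double pmc0 = 8.0e8; /* OFFCORE_RESPONSE_0_LOCAL_DRAM */
    double pmc1 = 2.0e8; /* OFFCORE_RESPONSE_1_REMOTE_DRAM */
    double time = 2.0;   /* runtime [s] */

    printf("Local DRAM bandwidth [MByte/s]:  %.1f\n", 1.0e-6 * (pmc0 * 64.0) / time);
    printf("Remote DRAM bandwidth [MByte/s]: %.1f\n", 1.0e-6 * (pmc1 * 64.0) / time);
    /* Not a metric of the group, but a handy derived value: */
    printf("Local access share:              %.2f\n", pmc0 / (pmc0 + pmc1));
    return 0;
}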
diff --git a/collectors/likwid/groups/ivybridgeEP/PORT_USAGE.txt b/collectors/likwid/groups/ivybridgeEP/PORT_USAGE.txt new file mode 100644 index 0000000..d509607 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/PORT_USAGE.txt @@ -0,0 +1,40 @@ +SHORT Execution port utilization + +REQUIRE_NOHT + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_DISPATCHED_PORT_PORT_0 +PMC1 UOPS_DISPATCHED_PORT_PORT_1 +PMC2 UOPS_DISPATCHED_PORT_PORT_2 +PMC3 UOPS_DISPATCHED_PORT_PORT_3 +PMC4 UOPS_DISPATCHED_PORT_PORT_4 +PMC5 UOPS_DISPATCHED_PORT_PORT_5 + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Port0 usage ratio PMC0/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port1 usage ratio PMC1/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port2 usage ratio PMC2/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port3 usage ratio PMC3/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port4 usage ratio PMC4/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port5 usage ratio PMC5/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) + +LONG +Formulas: +Port0 usage ratio = UOPS_DISPATCHED_PORT_PORT_0/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port1 usage ratio = UOPS_DISPATCHED_PORT_PORT_1/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port2 usage ratio = UOPS_DISPATCHED_PORT_PORT_2/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port3 usage ratio = UOPS_DISPATCHED_PORT_PORT_3/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port4 usage ratio = UOPS_DISPATCHED_PORT_PORT_4/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port5 usage ratio = UOPS_DISPATCHED_PORT_PORT_5/SUM(UOPS_DISPATCHED_PORT_PORT_*) +- +This group measures the execution port utilization in a CPU core. The group can +only be measured when HyperThreading is disabled because only then each CPU core +can program eight counters. 
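Each PORT_USAGE ratio is one port's dispatched uops divided by the sum over all six ports, so the six ratios add up to 1. A minimal C sketch with hypothetical counts:

/* Sketch: PORT_USAGE ratios, made-up values for
 * UOPS_DISPATCHED_PORT_PORT_0..5. */
#include <stdio.h>

int main(void)
{
    double pmc[6] = {3.1e9, 2.9e9, 2.2e9, 2.2e9, 1.6e9, 1.9e9};
    double sum = 0.0;
    for (int i = 0; i < 6; i++)
        sum += pmc[i];
    for (int i = 0; i < 6; i++)
        printf("Port%d usage ratio: %.3f\n", i, pmc[i] / sum);
    return 0;
}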
diff --git a/collectors/likwid/groups/ivybridgeEP/QPI.txt b/collectors/likwid/groups/ivybridgeEP/QPI.txt new file mode 100644 index 0000000..a2f1339 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/QPI.txt @@ -0,0 +1,52 @@ +SHORT QPI Link Layer data + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +SBOX0C0 DIRECT2CORE_SUCCESS_RBT_HIT +SBOX1C0 DIRECT2CORE_SUCCESS_RBT_HIT +SBOX2C0 DIRECT2CORE_SUCCESS_RBT_HIT +SBOX0C1 TXL_FLITS_G0_DATA +SBOX1C1 TXL_FLITS_G0_DATA +SBOX2C1 TXL_FLITS_G0_DATA +SBOX0C2 TXL_FLITS_G0_NON_DATA +SBOX1C2 TXL_FLITS_G0_NON_DATA +SBOX2C2 TXL_FLITS_G0_NON_DATA +SBOX0C3 SBOX_CLOCKTICKS +SBOX1C3 SBOX_CLOCKTICKS +SBOX2C3 SBOX_CLOCKTICKS +SBOX0FIX QPI_RATE +SBOX1FIX QPI_RATE +SBOX2FIX QPI_RATE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +QPI Speed Link 0 [GT/s] ((SBOX0C3)/time)*inverseClock*(8/1000) +QPI Speed Link 1 [GT/s] ((SBOX1C3)/time)*inverseClock*(8/1000) +QPI Speed Link 2 [GT/s] ((SBOX2C3)/time)*inverseClock*(8/1000) +QPI Rate Link 0 [GT/s] 1.E-09*SBOX0FIX +QPI Rate Link 1 [GT/s] 1.E-09*SBOX1FIX +QPI Rate Link 2 [GT/s] 1.E-09*SBOX2FIX +data from QPI to LLC [MByte] 1.E-06*(SBOX0C0+SBOX1C0+SBOX2C0)*8 +QPI data volume [MByte] 1.E-06*(SBOX0C1+SBOX1C1+SBOX2C1)*8 +QPI data bandwidth [MByte/s] 1.E-06*(SBOX0C1+SBOX1C1+SBOX2C1)*8/time +QPI link volume [MByte] 1.E-06*(SBOX0C1+SBOX1C1+SBOX2C1+SBOX0C2+SBOX1C2+SBOX2C2)*8 +QPI link bandwidth [MByte/s] 1.E-06*(SBOX0C1+SBOX1C1+SBOX2C1+SBOX0C2+SBOX1C2+SBOX2C2)*8/time + +LONG +Formulas: +QPI Speed Link 0/1/2 [GT/s] = ((SBOX_CLOCKTICKS)/time)*clock*(8/1000) +QPI Rate Link 0/1/2 [GT/s] = 1.E-09*(QPI_RATE) +data from QPI to LLC [MByte] = 1.E-06*(sum(DIRECT2CORE_SUCCESS_RBT_HIT)*64) +QPI data volume [MByte] = 1.E-06*(sum(TXL_FLITS_G0_DATA)*8) +QPI data bandwidth [MByte/s] = 1.E-06*(sum(TXL_FLITS_G0_DATA)*8)/runtime +QPI link volume [MByte] = 1.E-06*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8) +QPI link bandwidth [MByte/s] = 1.E-06*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8)/runtime +-- +The Intel QPI Link Layer is responsible for packetizing requests from the caching agent (CBOXes) +on the way out to the system interface. + diff --git a/collectors/likwid/groups/ivybridgeEP/RECOVERY.txt b/collectors/likwid/groups/ivybridgeEP/RECOVERY.txt new file mode 100644 index 0000000..7928ee4 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/RECOVERY.txt @@ -0,0 +1,22 @@ +SHORT Recovery duration + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INT_MISC_RECOVERY_CYCLES +PMC1 INT_MISC_RECOVERY_COUNT + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Avg. recovery duration PMC0/PMC1 + +LONG +Formulas: +Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT +- +This group measures the duration of recoveries after SSE exception, memory +disambiguation, etc... 
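In the QPI group above, every link-layer flit accounts for 8 bytes, which is why the data and link volumes are flit counts scaled by 8. A C sketch of the link bandwidth over the three SBOX links, with hypothetical flit counts:

/* Sketch: QPI link bandwidth from data and non-data flit counts
 * (made-up values for TXL_FLITS_G0_DATA / TXL_FLITS_G0_NON_DATA). */
double qpi_link_bw_mbs(const double data_flits[3],
                       const double nondata_flits[3], double seconds)
{
    double flits = 0.0;
    for (int i = 0; i < 3; i++)
        flits += data_flits[i] + nondata_flits[i];
    return 1.0e-6 * flits * 8.0 / seconds; /* [MByte/s] */
}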
diff --git a/collectors/likwid/groups/ivybridgeEP/TLB_DATA.txt b/collectors/likwid/groups/ivybridgeEP/TLB_DATA.txt new file mode 100644 index 0000000..8d94e05 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/TLB_DATA.txt @@ -0,0 +1,35 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK +PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK +PMC2 DTLB_LOAD_MISSES_WALK_DURATION +PMC3 DTLB_STORE_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB load misses PMC0 +L1 DTLB load miss rate PMC0/FIXC0 +L1 DTLB load miss duration [Cyc] PMC2/PMC0 +L1 DTLB store misses PMC1 +L1 DTLB store miss rate PMC1/FIXC0 +L1 DTLB store miss duration [Cyc] PMC3/PMC1 + +LONG +Formulas: +L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK +L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK +- +The DTLB load and store miss rates give a measure of how often a TLB miss occurred +per instruction. The duration measures how many cycles a page table walk took. + diff --git a/collectors/likwid/groups/ivybridgeEP/TLB_INSTR.txt b/collectors/likwid/groups/ivybridgeEP/TLB_INSTR.txt new file mode 100644 index 0000000..235d977 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/TLB_INSTR.txt @@ -0,0 +1,28 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ITLB_MISSES_CAUSES_A_WALK +PMC1 ITLB_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + + +LONG +Formulas: +L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK +L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK +- +The ITLB miss rate gives a measure of how often a TLB miss occurred +per instruction. The duration measures how many cycles a page table walk took.
+ diff --git a/collectors/likwid/groups/ivybridgeEP/TMA.txt b/collectors/likwid/groups/ivybridgeEP/TMA.txt new file mode 100644 index 0000000..afb4126 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/TMA.txt @@ -0,0 +1,48 @@ +SHORT Top down cycle allocation + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_RETIRED_RETIRE_SLOTS +PMC2 IDQ_UOPS_NOT_DELIVERED_CORE +PMC3 INT_MISC_RECOVERY_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +IPC FIXC0/FIXC1 +Total Slots 4*FIXC1 +Slots Retired PMC1 +Fetch Bubbles PMC2 +Recovery Bubbles 4*PMC3 +Front End [%] PMC2/(4*FIXC1)*100 +Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100 +Retiring [%] PMC1/(4*FIXC1)*100 +Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100 + +LONG +Formulas: +Total Slots = 4*CPU_CLK_UNHALTED_CORE +Slots Retired = UOPS_RETIRED_RETIRE_SLOTS +Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE +Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES +Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100 +Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100 +Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100 +Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100 +-- +This performance group measures cycles to determine the percentage of time spent in +front end, back end, retiring and speculation. These metrics are published and +verified by Intel. Further information: +Webpage describing Top-Down Method and its usage in Intel vTune: +https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method +Paper by Ahmad Yasin: +https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0 +Slides by Ahmad Yasin: +http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf +The performance group was originally published here: +http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/ diff --git a/collectors/likwid/groups/ivybridgeEP/UNCORECLOCK.txt b/collectors/likwid/groups/ivybridgeEP/UNCORECLOCK.txt new file mode 100644 index 0000000..1cc1f98 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/UNCORECLOCK.txt @@ -0,0 +1,96 @@ +SHORT All Clocks + +EVENTSET +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +CBOX0C0 CBOX_CLOCKTICKS +CBOX1C0 CBOX_CLOCKTICKS +CBOX2C0 CBOX_CLOCKTICKS +CBOX3C0 CBOX_CLOCKTICKS +CBOX4C0 CBOX_CLOCKTICKS +CBOX5C0 CBOX_CLOCKTICKS +CBOX6C0 CBOX_CLOCKTICKS +CBOX7C0 CBOX_CLOCKTICKS +CBOX8C0 CBOX_CLOCKTICKS +CBOX9C0 CBOX_CLOCKTICKS +CBOX10C0 CBOX_CLOCKTICKS +CBOX11C0 CBOX_CLOCKTICKS +CBOX12C0 CBOX_CLOCKTICKS +CBOX13C0 CBOX_CLOCKTICKS +CBOX14C0 CBOX_CLOCKTICKS +MBOX0C0 DRAM_CLOCKTICKS +MBOX1C0 DRAM_CLOCKTICKS +MBOX2C0 DRAM_CLOCKTICKS +MBOX3C0 DRAM_CLOCKTICKS +MBOX0FIX DRAM_CLOCKTICKS +MBOX1FIX DRAM_CLOCKTICKS +MBOX2FIX DRAM_CLOCKTICKS +MBOX3FIX DRAM_CLOCKTICKS +SBOX0C0 SBOX_CLOCKTICKS +SBOX1C0 SBOX_CLOCKTICKS +SBOX2C0 SBOX_CLOCKTICKS +UBOXFIX UNCORE_CLOCK +BBOX0C0 BBOX_CLOCKTICKS +BBOX1C0 BBOX_CLOCKTICKS +WBOX0 WBOX_CLOCKTICKS +PBOX0 PBOX_CLOCKTICKS +RBOX0C0 RBOX_CLOCKTICKS +RBOX1C0 RBOX_CLOCKTICKS +RBOX2C0 RBOX_CLOCKTICKS +IBOX0C0 IBOX_CLOCKTICKS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +UBOX Frequency [GHz] 1.E-09*UBOXFIX/(FIXC1*inverseClock) +CBOX0 Frequency [GHz] 1.E-09*CBOX0C0/(FIXC1*inverseClock) +CBOX1 Frequency [GHz] 1.E-09*CBOX1C0/(FIXC1*inverseClock) +CBOX2 Frequency [GHz] 1.E-09*CBOX2C0/(FIXC1*inverseClock) +CBOX3 Frequency [GHz] 1.E-09*CBOX3C0/(FIXC1*inverseClock) +CBOX4 Frequency [GHz] 1.E-09*CBOX4C0/(FIXC1*inverseClock) +CBOX5 Frequency [GHz] 1.E-09*CBOX5C0/(FIXC1*inverseClock) +CBOX6 Frequency [GHz] 1.E-09*CBOX6C0/(FIXC1*inverseClock) +CBOX7 Frequency [GHz] 1.E-09*CBOX7C0/(FIXC1*inverseClock) +CBOX8 Frequency [GHz] 1.E-09*CBOX8C0/(FIXC1*inverseClock) +CBOX9 Frequency [GHz] 1.E-09*CBOX9C0/(FIXC1*inverseClock) +CBOX10 Frequency [GHz] 1.E-09*CBOX10C0/(FIXC1*inverseClock) +CBOX11 Frequency [GHz] 1.E-09*CBOX11C0/(FIXC1*inverseClock) +CBOX12 Frequency [GHz] 1.E-09*CBOX12C0/(FIXC1*inverseClock) +CBOX13 Frequency [GHz] 1.E-09*CBOX13C0/(FIXC1*inverseClock) +CBOX14 Frequency [GHz] 1.E-09*CBOX14C0/(FIXC1*inverseClock) +MBOX0 Frequency [GHz] 1.E-09*MBOX0C0/(FIXC1*inverseClock) +MBOX0FIX Frequency [GHz] 1.E-09*MBOX0FIX/(FIXC1*inverseClock) +MBOX1 Frequency [GHz] 1.E-09*MBOX1C0/(FIXC1*inverseClock) +MBOX1FIX Frequency [GHz] 1.E-09*MBOX1FIX/(FIXC1*inverseClock) +MBOX2 Frequency [GHz] 1.E-09*MBOX2C0/(FIXC1*inverseClock) +MBOX2FIX Frequency [GHz] 1.E-09*MBOX2FIX/(FIXC1*inverseClock) +MBOX3 Frequency [GHz] 1.E-09*MBOX3C0/(FIXC1*inverseClock) +MBOX3FIX Frequency [GHz] 1.E-09*MBOX3FIX/(FIXC1*inverseClock) +SBOX0 Frequency [GHz] 1.E-09*SBOX0C0/(FIXC1*inverseClock) +SBOX1 Frequency [GHz] 1.E-09*SBOX1C0/(FIXC1*inverseClock) +SBOX2 Frequency [GHz] 1.E-09*SBOX2C0/(FIXC1*inverseClock) +BBOX0 Frequency [GHz] 1.E-09*BBOX0C0/(FIXC1*inverseClock) +BBOX1 Frequency [GHz] 1.E-09*BBOX1C0/(FIXC1*inverseClock) +WBOX Frequency [GHz] 1.E-09*WBOX0/(FIXC1*inverseClock) +PBOX Frequency [GHz] 1.E-09*PBOX0/(FIXC1*inverseClock) +RBOX0 Frequency [GHz] 1.E-09*RBOX0C0/(FIXC1*inverseClock) +RBOX1 Frequency [GHz] 1.E-09*RBOX1C0/(FIXC1*inverseClock) +RBOX2 Frequency [GHz] 1.E-09*RBOX2C0/(FIXC1*inverseClock) +IBOX Frequency [GHz] 1.E-09*IBOX0C0/(FIXC1*inverseClock) + + +LONG +Formulas: +UBOX Frequency [GHz] = 1.E-09*UNCORE_CLOCK/(CPU_CLK_UNHALTED_CORE*inverseClock) +CBOX[0-14] Frequency [GHz] = 1.E-09*CBOX_CLOCKTICKS/(CPU_CLK_UNHALTED_CORE*inverseClock) +MBOX[0-3] Frequency [GHz] = 1.E-09*DRAM_CLOCKTICKS/(CPU_CLK_UNHALTED_CORE*inverseClock) +MBOX[0-3]FIX Frequency [GHz] = 1.E-09*DRAM_CLOCKTICKS/(CPU_CLK_UNHALTED_CORE*inverseClock) +SBOX[0-2] Frequency [GHz] = 1.E-09*SBOX_CLOCKTICKS/(CPU_CLK_UNHALTED_CORE*inverseClock) +BBOX[0-1] Frequency [GHz] = 1.E-09*BBOX_CLOCKTICKS/(CPU_CLK_UNHALTED_CORE*inverseClock) +RBOX[0-2] Frequency [GHz] = 1.E-09*RBOX_CLOCKTICKS/(CPU_CLK_UNHALTED_CORE*inverseClock) +WBOX Frequency [GHz] = 1.E-09*WBOX_CLOCKTICKS/(CPU_CLK_UNHALTED_CORE*inverseClock) +PBOX Frequency [GHz] = 1.E-09*PBOX_CLOCKTICKS/(CPU_CLK_UNHALTED_CORE*inverseClock) +IBOX Frequency [GHz] = 1.E-09*IBOX_CLOCKTICKS/(CPU_CLK_UNHALTED_CORE*inverseClock) +-- +An overview of the frequencies of all Uncore units.
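All the frequency metrics in this group share one pattern: a unit's clock ticks divided by the unhalted core time, which is reconstructed as CPU_CLK_UNHALTED_CORE * inverseClock. A C sketch with hypothetical values:

/* Sketch: Uncore box frequency, made-up counter values. */
double box_freq_ghz(double box_ticks, double fixc1, double inverse_clock)
{
    double seconds = fixc1 * inverse_clock; /* Runtime unhalted [s] */
    return 1.0e-9 * box_ticks / seconds;    /* [GHz] */
}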
diff --git a/collectors/likwid/groups/ivybridgeEP/UOPS.txt b/collectors/likwid/groups/ivybridgeEP/UOPS.txt new file mode 100644 index 0000000..e6cc208 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/UOPS.txt @@ -0,0 +1,35 @@ +SHORT UOPs execution info + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_EXECUTED_THREAD +PMC2 UOPS_RETIRED_ALL +PMC3 UOPS_ISSUED_FLAGS_MERGE + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Issued UOPs PMC0 +Merged UOPs PMC3 +Executed UOPs PMC1 +Retired UOPs PMC2 + +LONG +Formulas: +Issued UOPs = UOPS_ISSUED_ANY +Merged UOPs = UOPS_ISSUED_FLAGS_MERGE +Executed UOPs = UOPS_EXECUTED_THREAD +Retired UOPs = UOPS_RETIRED_ALL +- +This group returns information about the instruction pipeline. It measures the +issued, executed and retired uOPs and returns the number of uOPs which were issued +but not executed as well as the number of uOPs which were executed but never retired. +The executed but not retired uOPs commonly come from speculatively executed branches. + diff --git a/collectors/likwid/groups/ivybridgeEP/UOPS_EXEC.txt b/collectors/likwid/groups/ivybridgeEP/UOPS_EXEC.txt new file mode 100644 index 0000000..7042df7 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/UOPS_EXEC.txt @@ -0,0 +1,31 @@ +SHORT UOPs execution + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_USED_CYCLES +PMC1 UOPS_EXECUTED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the execution stage in the pipeline. Used cycles are all cycles where uOPs are +executed while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles. 
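The PMC3:EDGEDETECT entry in UOPS_EXEC (and in the two groups that follow) counts rising edges of the stall condition, i.e. the number of distinct stall phases, while PMC1 counts the stalled cycles themselves; their quotient is the average stall length. Illustrated in C with hypothetical counts:

/* Sketch: average stall duration from a level count and its
 * edge-detected variant (made-up values). */
double avg_stall_cycles(double stall_cycles, double stall_phases)
{
    /* e.g. 4.0e8 stall cycles spread over 2.0e7 stall phases
     * -> 20 cycles per stall on average */
    return stall_cycles / stall_phases;
}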
diff --git a/collectors/likwid/groups/ivybridgeEP/UOPS_ISSUE.txt b/collectors/likwid/groups/ivybridgeEP/UOPS_ISSUE.txt new file mode 100644 index 0000000..9aac923 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/UOPS_ISSUE.txt @@ -0,0 +1,31 @@ +SHORT UOPs issuing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_USED_CYCLES +PMC1 UOPS_ISSUED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the issue stage in the pipeline. Used cycles are all cycles where uOPs are +issued while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles. diff --git a/collectors/likwid/groups/ivybridgeEP/UOPS_RETIRE.txt b/collectors/likwid/groups/ivybridgeEP/UOPS_RETIRE.txt new file mode 100644 index 0000000..0f37585 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/UOPS_RETIRE.txt @@ -0,0 +1,31 @@ +SHORT UOPs retirement + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_RETIRED_USED_CYCLES +PMC1 UOPS_RETIRED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the retirement stage in the pipeline (re-order buffer). Used cycles are all +cycles where uOPs are retired while unused cycles refer to pipeline stalls. +Moreover, the group calculates the average stall duration in cycles.
diff --git a/collectors/likwid/groups/k10/BRANCH.txt b/collectors/likwid/groups/k10/BRANCH.txt new file mode 100644 index 0000000..5c4207e --- /dev/null +++ b/collectors/likwid/groups/k10/BRANCH.txt @@ -0,0 +1,26 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +PMC0 INSTRUCTIONS_RETIRED +PMC1 BRANCH_RETIRED +PMC2 BRANCH_MISPREDICT_RETIRED + +METRICS +Runtime (RDTSC) [s] time +Branch rate PMC1/PMC0 +Branch misprediction rate PMC2/PMC0 +Branch misprediction ratio PMC2/PMC1 +Instructions per branch PMC0/PMC1 + +LONG +Formulas: +Branch rate = BRANCH_RETIRED/INSTRUCTIONS_RETIRED +Branch misprediction rate = BRANCH_MISPREDICT_RETIRED/INSTRUCTIONS_RETIRED +Branch misprediction ratio = BRANCH_MISPREDICT_RETIRED/BRANCH_RETIRED +Instructions per branch = INSTRUCTIONS_RETIRED/BRANCH_RETIRED +- +The rates state how often, on average, a branch or a mispredicted branch occurred +per retired instruction. The branch misprediction ratio directly expresses what +fraction of all branch instructions was mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/k10/CACHE.txt b/collectors/likwid/groups/k10/CACHE.txt new file mode 100644 index 0000000..26d799f --- /dev/null +++ b/collectors/likwid/groups/k10/CACHE.txt @@ -0,0 +1,34 @@ +SHORT Data cache miss rate/ratio + +EVENTSET +PMC0 INSTRUCTIONS_RETIRED +PMC1 DATA_CACHE_ACCESSES +PMC2 DATA_CACHE_REFILLS_L2_ALL +PMC3 DATA_CACHE_REFILLS_NORTHBRIDGE_ALL + +METRICS +Runtime (RDTSC) [s] time +data cache misses PMC2+PMC3 +data cache request rate PMC1/PMC0 +data cache miss rate (PMC2+PMC3)/PMC0 +data cache miss ratio (PMC2+PMC3)/PMC1 + +LONG +Formulas: +data cache misses = DATA_CACHE_REFILLS_L2_ALL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL +data cache request rate = DATA_CACHE_ACCESSES / INSTRUCTIONS_RETIRED +data cache miss rate = (DATA_CACHE_REFILLS_L2_ALL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL)/INSTRUCTIONS_RETIRED +data cache miss ratio = (DATA_CACHE_REFILLS_L2_ALL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL)/DATA_CACHE_ACCESSES +- +This group measures the locality of your data accesses with regard to the +L1 cache. Data cache request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The data cache miss rate gives a measure of how often it was necessary to get +cache lines from higher levels of the memory hierarchy. And finally +data cache miss ratio tells you how many of your memory references required +a cache line to be loaded from a higher level. While the data cache miss rate +might be given by your algorithm, you should try to get the data cache miss ratio +as low as possible by increasing your cache reuse. +This group was taken from the whitepaper -Basic Performance Measurements for AMD Athlon 64, +AMD Opteron and AMD Phenom Processors- by Paul J. Drongowski.
+ diff --git a/collectors/likwid/groups/k10/CPI.txt b/collectors/likwid/groups/k10/CPI.txt new file mode 100644 index 0000000..850afed --- /dev/null +++ b/collectors/likwid/groups/k10/CPI.txt @@ -0,0 +1,26 @@ +SHORT Cycles per instruction + +EVENTSET +PMC0 INSTRUCTIONS_RETIRED +PMC1 CPU_CLOCKS_UNHALTED +PMC2 UOPS_RETIRED + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC1*inverseClock +CPI PMC1/PMC0 +CPI (based on uops) PMC1/PMC2 +IPC PMC0/PMC1 + +LONG +Formulas: +CPI = CPU_CLOCKS_UNHALTED/INSTRUCTIONS_RETIRED +CPI (based on uops) = CPU_CLOCKS_UNHALTED/UOPS_RETIRED +IPC = INSTRUCTIONS_RETIRED/CPU_CLOCKS_UNHALTED +- +This group measures how efficiently the processor works with +regard to instruction throughput. Also important as a standalone +metric is INSTRUCTIONS_RETIRED as it tells you how many instructions +you need to execute for a task. An optimization might show very +low CPI values but execute many more instructions for it. + diff --git a/collectors/likwid/groups/k10/FLOPS_DP.txt b/collectors/likwid/groups/k10/FLOPS_DP.txt new file mode 100644 index 0000000..89f0ac2 --- /dev/null +++ b/collectors/likwid/groups/k10/FLOPS_DP.txt @@ -0,0 +1,24 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +PMC0 SSE_RETIRED_ADD_DOUBLE_FLOPS +PMC1 SSE_RETIRED_MULT_DOUBLE_FLOPS +PMC2 CPU_CLOCKS_UNHALTED + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC2*inverseClock +DP [MFLOP/s] 1.0E-06*(PMC0+PMC1)/time +DP Add [MFLOP/s] 1.0E-06*PMC0/time +DP Mult [MFLOP/s] 1.0E-06*PMC1/time + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(SSE_RETIRED_ADD_DOUBLE_FLOPS+SSE_RETIRED_MULT_DOUBLE_FLOPS)/time +DP Add [MFLOP/s] = 1.0E-06*(SSE_RETIRED_ADD_DOUBLE_FLOPS)/time +DP Mult [MFLOP/s] = 1.0E-06*(SSE_RETIRED_MULT_DOUBLE_FLOPS)/time +- +Profiling group to measure double precision SSE FLOPs. +Don't forget that your code might also execute X87 FLOPs. + + diff --git a/collectors/likwid/groups/k10/FLOPS_SP.txt b/collectors/likwid/groups/k10/FLOPS_SP.txt new file mode 100644 index 0000000..590d39a --- /dev/null +++ b/collectors/likwid/groups/k10/FLOPS_SP.txt @@ -0,0 +1,24 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +PMC0 SSE_RETIRED_ADD_SINGLE_FLOPS +PMC1 SSE_RETIRED_MULT_SINGLE_FLOPS +PMC2 CPU_CLOCKS_UNHALTED + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC2*inverseClock +SP [MFLOP/s] 1.0E-06*(PMC0+PMC1)/time +SP Add [MFLOP/s] 1.0E-06*PMC0/time +SP Mult [MFLOP/s] 1.0E-06*PMC1/time + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(SSE_RETIRED_ADD_SINGLE_FLOPS+SSE_RETIRED_MULT_SINGLE_FLOPS)/time +SP Add [MFLOP/s] = 1.0E-06*(SSE_RETIRED_ADD_SINGLE_FLOPS)/time +SP Mult [MFLOP/s] = 1.0E-06*(SSE_RETIRED_MULT_SINGLE_FLOPS)/time +- +Profiling group to measure single precision SSE FLOPs. +Don't forget that your code might also execute X87 FLOPs.
+ + diff --git a/collectors/likwid/groups/k10/FLOPS_X87.txt b/collectors/likwid/groups/k10/FLOPS_X87.txt new file mode 100644 index 0000000..62fbefc --- /dev/null +++ b/collectors/likwid/groups/k10/FLOPS_X87.txt @@ -0,0 +1,25 @@ +SHORT X87 MFLOP/s + +EVENTSET +PMC0 X87_FLOPS_RETIRED_ADD +PMC1 X87_FLOPS_RETIRED_MULT +PMC2 X87_FLOPS_RETIRED_DIV +PMC3 CPU_CLOCKS_UNHALTED + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC3*inverseClock +X87 [MFLOP/s] 1.0E-06*(PMC0+PMC1+PMC2)/time +X87 Add [MFLOP/s] 1.0E-06*PMC0/time +X87 Mult [MFLOP/s] 1.0E-06*PMC1/time +X87 Div [MFLOP/s] 1.0E-06*PMC2/time + +LONG +Formulas: +X87 [MFLOP/s] = 1.0E-06*(X87_FLOPS_RETIRED_ADD+X87_FLOPS_RETIRED_MULT+X87_FLOPS_RETIRED_DIV)/time +X87 Add [MFLOP/s] = 1.0E-06*X87_FLOPS_RETIRED_ADD/time +X87 Mult [MFLOP/s] = 1.0E-06*X87_FLOPS_RETIRED_MULT/time +X87 Div [MFLOP/s] = 1.0E-06*X87_FLOPS_RETIRED_DIV/time +- +Profiling group to measure X87 FLOP rates. + diff --git a/collectors/likwid/groups/k10/FPU_EXCEPTION.txt b/collectors/likwid/groups/k10/FPU_EXCEPTION.txt new file mode 100644 index 0000000..23d3c54 --- /dev/null +++ b/collectors/likwid/groups/k10/FPU_EXCEPTION.txt @@ -0,0 +1,21 @@ +SHORT Floating point exceptions + +EVENTSET +PMC0 INSTRUCTIONS_RETIRED +PMC1 FP_INSTRUCTIONS_RETIRED_ALL +PMC2 FPU_EXCEPTIONS_ALL + +METRICS +Runtime (RDTSC) [s] time +Overall FP exception rate PMC2/PMC0 +FP exception rate PMC2/PMC1 + +LONG +Formulas: +Overall FP exception rate = FPU_EXCEPTIONS_ALL / INSTRUCTIONS_RETIRED +FP exception rate = FPU_EXCEPTIONS_ALL / FP_INSTRUCTIONS_RETIRED_ALL +- +Floating point exceptions occur, e.g., during the treatment of denormal numbers. +There might be a large penalty if there are too many floating point +exceptions. + diff --git a/collectors/likwid/groups/k10/ICACHE.txt b/collectors/likwid/groups/k10/ICACHE.txt new file mode 100644 index 0000000..5150496 --- /dev/null +++ b/collectors/likwid/groups/k10/ICACHE.txt @@ -0,0 +1,23 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +PMC0 INSTRUCTIONS_RETIRED +PMC1 ICACHE_FETCHES +PMC2 ICACHE_REFILLS_L2 +PMC3 ICACHE_REFILLS_MEM + +METRICS +Runtime (RDTSC) [s] time +L1I request rate PMC1/PMC0 +L1I miss rate (PMC2+PMC3)/PMC0 +L1I miss ratio (PMC2+PMC3)/PMC1 + +LONG +Formulas: +L1I request rate = ICACHE_FETCHES / INSTRUCTIONS_RETIRED +L1I miss rate = (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/INSTRUCTIONS_RETIRED +L1I miss ratio = (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/ICACHE_FETCHES +- +This group measures the locality of your instruction code with regard to the +L1 I-Cache.
+ diff --git a/collectors/likwid/groups/k10/L2.txt b/collectors/likwid/groups/k10/L2.txt new file mode 100644 index 0000000..fae6fb0 --- /dev/null +++ b/collectors/likwid/groups/k10/L2.txt @@ -0,0 +1,33 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +PMC0 DATA_CACHE_REFILLS_L2_ALL +PMC1 DATA_CACHE_EVICTED_ALL +PMC2 CPU_CLOCKS_UNHALTED + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC2*inverseClock +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*DATA_CACHE_REFILLS_L2_ALL*64.0/time +L2D load data volume [GBytes] = 1.0E-09*DATA_CACHE_REFILLS_L2_ALL*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*DATA_CACHE_EVICTED_ALL*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*DATA_CACHE_EVICTED_ALL*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_L2_ALL+DATA_CACHE_EVICTED_ALL)*64/time +L2 data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_L2_ALL+DATA_CACHE_EVICTED_ALL)*64 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is +computed by the number of cache lines loaded from L2 to L1 and the +number of modified cache lines evicted from the L1. +Note that this bandwidth also includes data transfers due to a +write allocate load on a store miss in L1 and copy back transfers +originating from L2. + diff --git a/collectors/likwid/groups/k10/L2CACHE.txt b/collectors/likwid/groups/k10/L2CACHE.txt new file mode 100644 index 0000000..2d29e43 --- /dev/null +++ b/collectors/likwid/groups/k10/L2CACHE.txt @@ -0,0 +1,32 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +PMC0 INSTRUCTIONS_RETIRED +PMC1 L2_REQUESTS_ALL +PMC2 L2_MISSES_ALL +PMC3 L2_FILL_ALL + +METRICS +Runtime (RDTSC) [s] time +L2 request rate (PMC1+PMC3)/PMC0 +L2 miss rate PMC2/PMC0 +L2 miss ratio PMC2/(PMC1+PMC3) + +LONG +Formulas: +L2 request rate = (L2_REQUESTS_ALL+L2_FILL_ALL)/INSTRUCTIONS_RETIRED +L2 miss rate = L2_MISSES_ALL/INSTRUCTIONS_RETIRED +L2 miss ratio = L2_MISSES_ALL/(L2_REQUESTS_ALL+L2_FILL_ALL) +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse. +This group was taken from the whitepaper -Basic Performance Measurements for AMD Athlon 64, +AMD Opteron and AMD Phenom Processors- by Paul J. Drongowski.
+
+
diff --git a/collectors/likwid/groups/k10/MEM.txt b/collectors/likwid/groups/k10/MEM.txt
new file mode 100644
index 0000000..f9f5a91
--- /dev/null
+++ b/collectors/likwid/groups/k10/MEM.txt
@@ -0,0 +1,35 @@
+SHORT Main memory bandwidth in MBytes/s
+
+EVENTSET
+PMC0 NORTHBRIDGE_READ_RESPONSE_ALL
+PMC1 OCTWORDS_WRITE_TRANSFERS
+PMC2 DRAM_ACCESSES_DCTO_ALL
+PMC3 DRAM_ACCESSES_DCT1_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Memory read bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+Memory read data volume [GBytes] 1.0E-09*PMC0*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*PMC1*8.0/time
+Memory write data volume [GBytes] 1.0E-09*PMC1*8.0
+Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*NORTHBRIDGE_READ_RESPONSE_ALL*64/time
+Memory read data volume [GBytes] = 1.0E-09*NORTHBRIDGE_READ_RESPONSE_ALL*64
+Memory write bandwidth [MBytes/s] = 1.0E-06*OCTWORDS_WRITE_TRANSFERS*8/time
+Memory write data volume [GBytes] = 1.0E-09*OCTWORDS_WRITE_TRANSFERS*8
+Memory bandwidth [MBytes/s] = 1.0E-06*(DRAM_ACCESSES_DCTO_ALL+DRAM_ACCESSES_DCT1_ALL)*64/time
+Memory data volume [GBytes] = 1.0E-09*(DRAM_ACCESSES_DCTO_ALL+DRAM_ACCESSES_DCT1_ALL)*64
+-
+Profiling group to measure the memory bandwidth drawn by all cores of a socket.
+Note: As this group measures the accesses from all cores, it only makes sense
+to measure with one core per socket, similar to the Intel Nehalem Uncore events.
+The memory read bandwidth contains all data from DRAM, L3, or another cache,
+including another core on the same node. The event OCTWORDS_WRITE_TRANSFERS counts
+16 Byte transfers, not 64 Byte.
+
+
+
diff --git a/collectors/likwid/groups/k10/NUMA_0_3.txt b/collectors/likwid/groups/k10/NUMA_0_3.txt
new file mode 100644
index 0000000..66e56d9
--- /dev/null
+++ b/collectors/likwid/groups/k10/NUMA_0_3.txt
@@ -0,0 +1,27 @@
+SHORT Bandwidth on the HyperTransport links
+
+EVENTSET
+PMC0 CPU_TO_DRAM_LOCAL_TO_0
+PMC1 CPU_TO_DRAM_LOCAL_TO_1
+PMC2 CPU_TO_DRAM_LOCAL_TO_2
+PMC3 CPU_TO_DRAM_LOCAL_TO_3
+
+METRICS
+Runtime (RDTSC) [s] time
+Hyper Transport link0 bandwidth [MBytes/s] 1.0E-06*PMC0*4.0/time
+Hyper Transport link1 bandwidth [MBytes/s] 1.0E-06*PMC1*4.0/time
+Hyper Transport link2 bandwidth [MBytes/s] 1.0E-06*PMC2*4.0/time
+Hyper Transport link3 bandwidth [MBytes/s] 1.0E-06*PMC3*4.0/time
+
+LONG
+Formulas:
+Hyper Transport link0 bandwidth [MBytes/s] = 1.0E-06*CPU_TO_DRAM_LOCAL_TO_0*4.0/time
+Hyper Transport link1 bandwidth [MBytes/s] = 1.0E-06*CPU_TO_DRAM_LOCAL_TO_1*4.0/time
+Hyper Transport link2 bandwidth [MBytes/s] = 1.0E-06*CPU_TO_DRAM_LOCAL_TO_2*4.0/time
+Hyper Transport link3 bandwidth [MBytes/s] = 1.0E-06*CPU_TO_DRAM_LOCAL_TO_3*4.0/time
+-
+Profiling group to measure the bandwidth over the HyperTransport links. Can be used
+to detect NUMA problems. Usually there should be only limited traffic over the
+HyperTransport links for optimal performance.
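To make the scaling explicit: each link metric multiplies its event count by the factor 4.0 taken verbatim from the group file and divides by the wall-clock time. A minimal C sketch with hypothetical counter readings:

/* Hyper Transport link bandwidth, k10 NUMA_0_3 style.
 * The 4.0 scaling factor is copied from the group file;
 * all counter readings are hypothetical. */
#include <stdio.h>

int main(void) {
    double link_events[4] = { 2.5e9, 1.2e7, 8.0e6, 5.0e6 }; /* CPU_TO_DRAM_LOCAL_TO_0..3 */
    double time = 2.0; /* Runtime (RDTSC) [s] */

    for (int i = 0; i < 4; i++)
        printf("Hyper Transport link%d bandwidth [MBytes/s]: %.2f\n",
               i, 1.0e-6 * link_events[i] * 4.0 / time);
    return 0;
}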
+
+
diff --git a/collectors/likwid/groups/k10/NUMA_4_7.txt b/collectors/likwid/groups/k10/NUMA_4_7.txt
new file mode 100644
index 0000000..e13f2b9
--- /dev/null
+++ b/collectors/likwid/groups/k10/NUMA_4_7.txt
@@ -0,0 +1,27 @@
+SHORT Bandwidth on the HyperTransport links
+
+EVENTSET
+PMC0 CPU_TO_DRAM_LOCAL_TO_4
+PMC1 CPU_TO_DRAM_LOCAL_TO_5
+PMC2 CPU_TO_DRAM_LOCAL_TO_6
+PMC3 CPU_TO_DRAM_LOCAL_TO_7
+
+METRICS
+Runtime (RDTSC) [s] time
+Hyper Transport link4 bandwidth [MBytes/s] 1.0E-06*PMC0*4.0/time
+Hyper Transport link5 bandwidth [MBytes/s] 1.0E-06*PMC1*4.0/time
+Hyper Transport link6 bandwidth [MBytes/s] 1.0E-06*PMC2*4.0/time
+Hyper Transport link7 bandwidth [MBytes/s] 1.0E-06*PMC3*4.0/time
+
+LONG
+Formulas:
+Hyper Transport link4 bandwidth [MBytes/s] = 1.0E-06*CPU_TO_DRAM_LOCAL_TO_4*4.0/time
+Hyper Transport link5 bandwidth [MBytes/s] = 1.0E-06*CPU_TO_DRAM_LOCAL_TO_5*4.0/time
+Hyper Transport link6 bandwidth [MBytes/s] = 1.0E-06*CPU_TO_DRAM_LOCAL_TO_6*4.0/time
+Hyper Transport link7 bandwidth [MBytes/s] = 1.0E-06*CPU_TO_DRAM_LOCAL_TO_7*4.0/time
+-
+Profiling group to measure the bandwidth over the HyperTransport links. Can be used
+to detect NUMA problems. Usually there should be only limited traffic over the
+HyperTransport links for optimal performance.
+
+
diff --git a/collectors/likwid/groups/k10/TLB.txt b/collectors/likwid/groups/k10/TLB.txt
new file mode 100644
index 0000000..25cab33
--- /dev/null
+++ b/collectors/likwid/groups/k10/TLB.txt
@@ -0,0 +1,35 @@
+SHORT TLB miss rate/ratio
+
+EVENTSET
+PMC0 INSTRUCTIONS_RETIRED
+PMC1 DATA_CACHE_ACCESSES
+PMC2 DTLB_L2_HIT_ALL
+PMC3 DTLB_L2_MISS_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+L1 DTLB request rate PMC1/PMC0
+L1 DTLB miss rate (PMC2+PMC3)/PMC0
+L1 DTLB miss ratio (PMC2+PMC3)/PMC1
+L2 DTLB request rate (PMC2+PMC3)/PMC0
+L2 DTLB miss rate PMC3/PMC0
+L2 DTLB miss ratio PMC3/(PMC2+PMC3)
+
+
+LONG
+Formulas:
+L1 DTLB request rate = DATA_CACHE_ACCESSES / INSTRUCTIONS_RETIRED
+L1 DTLB miss rate = (DTLB_L2_HIT_ALL+DTLB_L2_MISS_ALL)/INSTRUCTIONS_RETIRED
+L1 DTLB miss ratio = (DTLB_L2_HIT_ALL+DTLB_L2_MISS_ALL)/DATA_CACHE_ACCESSES
+L2 DTLB request rate = (DTLB_L2_HIT_ALL+DTLB_L2_MISS_ALL)/INSTRUCTIONS_RETIRED
+L2 DTLB miss rate = DTLB_L2_MISS_ALL / INSTRUCTIONS_RETIRED
+L2 DTLB miss ratio = DTLB_L2_MISS_ALL / (DTLB_L2_HIT_ALL+DTLB_L2_MISS_ALL)
+-
+L1 DTLB request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The DTLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. And finally the L1 DTLB miss ratio tells you how many
+of your memory references caused a TLB miss on average.
+NOTE: The L2 metrics are only relevant if the L2 DTLB request rate is equal to the L1 DTLB miss rate!
+This group was taken from the whitepaper -Basic Performance Measurements for AMD Athlon 64,
+AMD Opteron and AMD Phenom Processors- from Paul J. Drongowski.
diff --git a/collectors/likwid/groups/k8/BRANCH.txt b/collectors/likwid/groups/k8/BRANCH.txt
new file mode 100644
index 0000000..f465335
--- /dev/null
+++ b/collectors/likwid/groups/k8/BRANCH.txt
@@ -0,0 +1,25 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+PMC0 INSTRUCTIONS_RETIRED
+PMC1 BRANCH_RETIRED
+PMC2 BRANCH_MISPREDICT_RETIRED
+
+METRICS
+Runtime (RDTSC) [s] time
+Branch rate PMC1/PMC0
+Branch misprediction rate PMC2/PMC0
+Branch misprediction ratio PMC2/PMC1
+Instructions per branch PMC0/PMC1
+
+LONG
+Formulas:
+Branch rate = BRANCH_RETIRED/INSTRUCTIONS_RETIRED
+Branch misprediction rate = BRANCH_MISPREDICT_RETIRED/INSTRUCTIONS_RETIRED
+Branch misprediction ratio = BRANCH_MISPREDICT_RETIRED/BRANCH_RETIRED
+Instructions per branch = INSTRUCTIONS_RETIRED/BRANCH_RETIRED
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
+into relation what ratio of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
diff --git a/collectors/likwid/groups/k8/CACHE.txt b/collectors/likwid/groups/k8/CACHE.txt
new file mode 100644
index 0000000..e5e813e
--- /dev/null
+++ b/collectors/likwid/groups/k8/CACHE.txt
@@ -0,0 +1,33 @@
+SHORT Data cache miss rate/ratio
+
+EVENTSET
+PMC0 INSTRUCTIONS_RETIRED
+PMC1 DATA_CACHE_ACCESSES
+PMC2 DATA_CACHE_REFILLS_L2_ALL
+PMC3 DATA_CACHE_REFILLS_NORTHBRIDGE_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+data cache misses PMC2+PMC3
+data cache request rate PMC1/PMC0
+data cache miss rate (PMC2+PMC3)/PMC0
+data cache miss ratio (PMC2+PMC3)/PMC1
+
+LONG
+Formulas:
+data cache misses = DATA_CACHE_REFILLS_L2_ALL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL
+data cache request rate = DATA_CACHE_ACCESSES / INSTRUCTIONS_RETIRED
+data cache miss rate = (DATA_CACHE_REFILLS_L2_ALL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL)/INSTRUCTIONS_RETIRED
+data cache miss ratio = (DATA_CACHE_REFILLS_L2_ALL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL)/DATA_CACHE_ACCESSES
+-
+This group measures the locality of your data accesses with regard to the
+L1 cache. Data cache request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The data cache miss rate gives a measure of how often it was necessary to get
+cache lines from higher levels of the memory hierarchy. And finally
+data cache miss ratio tells you how many of your memory references required
+a cache line to be loaded from a higher level. While the data cache miss rate
+might be given by your algorithm, you should try to get the data cache miss ratio
+as low as possible by increasing your cache reuse.
+This group was taken from the whitepaper -Basic Performance Measurements for AMD Athlon 64,
+AMD Opteron and AMD Phenom Processors- from Paul J. Drongowski.
diff --git a/collectors/likwid/groups/k8/CPI.txt b/collectors/likwid/groups/k8/CPI.txt
new file mode 100644
index 0000000..850afed
--- /dev/null
+++ b/collectors/likwid/groups/k8/CPI.txt
@@ -0,0 +1,26 @@
+SHORT Cycles per instruction
+
+EVENTSET
+PMC0 INSTRUCTIONS_RETIRED
+PMC1 CPU_CLOCKS_UNHALTED
+PMC2 UOPS_RETIRED
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] PMC1*inverseClock
+CPI PMC1/PMC0
+CPI (based on uops) PMC1/PMC2
+IPC PMC0/PMC1
+
+LONG
+Formulas:
+CPI = CPU_CLOCKS_UNHALTED/INSTRUCTIONS_RETIRED
+CPI (based on uops) = CPU_CLOCKS_UNHALTED/UOPS_RETIRED
+IPC = INSTRUCTIONS_RETIRED/CPU_CLOCKS_UNHALTED
+-
+This group measures how efficiently the processor works with
+regard to instruction throughput. Also important as a standalone
+metric is INSTRUCTIONS_RETIRED as it tells you how many instructions
+you need to execute for a task. An optimization might show very
+low CPI values but execute many more instructions for it.
+
diff --git a/collectors/likwid/groups/k8/ICACHE.txt b/collectors/likwid/groups/k8/ICACHE.txt
new file mode 100644
index 0000000..5150496
--- /dev/null
+++ b/collectors/likwid/groups/k8/ICACHE.txt
@@ -0,0 +1,23 @@
+SHORT Instruction cache miss rate/ratio
+
+EVENTSET
+PMC0 INSTRUCTIONS_RETIRED
+PMC1 ICACHE_FETCHES
+PMC2 ICACHE_REFILLS_L2
+PMC3 ICACHE_REFILLS_MEM
+
+METRICS
+Runtime (RDTSC) [s] time
+L1I request rate PMC1/PMC0
+L1I miss rate (PMC2+PMC3)/PMC0
+L1I miss ratio (PMC2+PMC3)/PMC1
+
+LONG
+Formulas:
+L1I request rate = ICACHE_FETCHES / INSTRUCTIONS_RETIRED
+L1I miss rate = (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/INSTRUCTIONS_RETIRED
+L1I miss ratio = (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/ICACHE_FETCHES
+-
+This group measures the locality of your instruction code with regard to the
+L1 I-Cache.
+
diff --git a/collectors/likwid/groups/k8/L2.txt b/collectors/likwid/groups/k8/L2.txt
new file mode 100644
index 0000000..63b9b7f
--- /dev/null
+++ b/collectors/likwid/groups/k8/L2.txt
@@ -0,0 +1,31 @@
+SHORT L2 cache bandwidth in MBytes/s
+
+EVENTSET
+PMC0 DATA_CACHE_REFILLS_L2_ALL
+PMC1 DATA_CACHE_EVICTED_ALL
+PMC2 CPU_CLOCKS_UNHALTED
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] PMC2*inverseClock
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L2 refill bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2 evict [MBytes/s] 1.0E-06*PMC1*64.0/time
+
+LONG
+Formulas:
+L2 bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_L2_ALL+DATA_CACHE_EVICTED_ALL)*64/time
+L2 data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_L2_ALL+DATA_CACHE_EVICTED_ALL)*64
+L2 refill bandwidth [MBytes/s] = 1.0E-06*DATA_CACHE_REFILLS_L2_ALL*64/time
+L2 evict [MBytes/s] = 1.0E-06*DATA_CACHE_EVICTED_ALL*64/time
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is
+computed by the number of cache lines loaded from L2 to L1 and the
+number of modified cache lines evicted from the L1.
+Note that this bandwidth also includes data transfers due to a
+write allocate load on a store miss in L1 and copy back transfers
+originating from L2.
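The bandwidth formulas above all follow the same pattern: a cache-line count times 64 bytes per line, scaled to MBytes/s. A minimal C sketch, with hypothetical counter readings:

/* k8 L2 bandwidth: refill and evict counts are cache-line transfers,
 * 64 bytes each. Readings are hypothetical placeholders. */
#include <stdio.h>

int main(void) {
    double pmc0 = 3.0e8; /* DATA_CACHE_REFILLS_L2_ALL */
    double pmc1 = 1.0e8; /* DATA_CACHE_EVICTED_ALL */
    double time = 1.5;   /* Runtime (RDTSC) [s] */

    printf("L2 refill bandwidth [MBytes/s]: %.2f\n", 1.0e-6 * pmc0 * 64.0 / time);
    printf("L2 evict [MBytes/s]:            %.2f\n", 1.0e-6 * pmc1 * 64.0 / time);
    printf("L2 bandwidth [MBytes/s]:        %.2f\n", 1.0e-6 * (pmc0 + pmc1) * 64.0 / time);
    printf("L2 data volume [GBytes]:        %.3f\n", 1.0e-9 * (pmc0 + pmc1) * 64.0);
    return 0;
}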
+
+
+
diff --git a/collectors/likwid/groups/kabini/BRANCH.txt b/collectors/likwid/groups/kabini/BRANCH.txt
new file mode 100644
index 0000000..7495b74
--- /dev/null
+++ b/collectors/likwid/groups/kabini/BRANCH.txt
@@ -0,0 +1,26 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+PMC0 RETIRED_INSTRUCTIONS
+PMC1 RETIRED_BRANCH_INSTR
+PMC2 RETIRED_MISPREDICTED_BRANCH_INSTR
+
+METRICS
+Runtime (RDTSC) [s] time
+Branch rate PMC1/PMC0
+Branch misprediction rate PMC2/PMC0
+Branch misprediction ratio PMC2/PMC1
+Instructions per branch PMC0/PMC1
+
+LONG
+Formulas:
+Branch rate = RETIRED_BRANCH_INSTR/RETIRED_INSTRUCTIONS
+Branch misprediction rate = RETIRED_MISPREDICTED_BRANCH_INSTR/RETIRED_INSTRUCTIONS
+Branch misprediction ratio = RETIRED_MISPREDICTED_BRANCH_INSTR/RETIRED_BRANCH_INSTR
+Instructions per branch = RETIRED_INSTRUCTIONS/RETIRED_BRANCH_INSTR
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
+into relation what ratio of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
+
diff --git a/collectors/likwid/groups/kabini/CACHE.txt b/collectors/likwid/groups/kabini/CACHE.txt
new file mode 100644
index 0000000..8a59288
--- /dev/null
+++ b/collectors/likwid/groups/kabini/CACHE.txt
@@ -0,0 +1,32 @@
+SHORT Data cache miss rate/ratio
+
+EVENTSET
+PMC0 RETIRED_INSTRUCTIONS
+PMC1 DATA_CACHE_ACCESSES
+PMC2 DATA_CACHE_REFILLS_ALL
+PMC3 DATA_CACHE_REFILLS_NB_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+data cache misses PMC2+PMC3
+data cache request rate PMC1/PMC0
+data cache miss rate (PMC2+PMC3)/PMC0
+data cache miss ratio (PMC2+PMC3)/PMC1
+
+LONG
+Formulas:
+data cache misses = DATA_CACHE_REFILLS_ALL + DATA_CACHE_REFILLS_NB_ALL
+data cache request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS
+data cache miss rate = (DATA_CACHE_REFILLS_ALL + DATA_CACHE_REFILLS_NB_ALL)/RETIRED_INSTRUCTIONS
+data cache miss ratio = (DATA_CACHE_REFILLS_ALL + DATA_CACHE_REFILLS_NB_ALL)/DATA_CACHE_ACCESSES
+-
+This group measures the locality of your data accesses with regard to the
+L1 cache. Data cache request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The data cache miss rate gives a measure of how often it was necessary to get
+cache lines from higher levels of the memory hierarchy. And finally
+data cache miss ratio tells you how many of your memory references required
+a cache line to be loaded from a higher level. While the data cache miss rate
+might be given by your algorithm, you should try to get the data cache miss ratio
+as low as possible by increasing your cache reuse.
+
diff --git a/collectors/likwid/groups/kabini/CPI.txt b/collectors/likwid/groups/kabini/CPI.txt
new file mode 100644
index 0000000..c0746e7
--- /dev/null
+++ b/collectors/likwid/groups/kabini/CPI.txt
@@ -0,0 +1,26 @@
+SHORT Cycles per instruction
+
+EVENTSET
+PMC0 RETIRED_INSTRUCTIONS
+PMC1 CPU_CLOCKS_UNHALTED
+PMC2 RETIRED_UOPS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] PMC1*inverseClock
+CPI PMC1/PMC0
+CPI (based on uops) PMC1/PMC2
+IPC PMC0/PMC1
+
+LONG
+Formulas:
+CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS
+CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS
+IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED
+-
+This group measures how efficiently the processor works with
+regard to instruction throughput.
Also important as a standalone
+metric is RETIRED_INSTRUCTIONS as it tells you how many instructions
+you need to execute for a task. An optimization might show very
+low CPI values but execute many more instructions for it.
+
diff --git a/collectors/likwid/groups/kabini/DATA.txt b/collectors/likwid/groups/kabini/DATA.txt
new file mode 100644
index 0000000..75f1f60
--- /dev/null
+++ b/collectors/likwid/groups/kabini/DATA.txt
@@ -0,0 +1,16 @@
+SHORT Load to store ratio
+
+EVENTSET
+PMC0 LS_DISPATCH_LOADS
+PMC1 LS_DISPATCH_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = LS_DISPATCH_LOADS/LS_DISPATCH_STORES
+-
+This is a simple metric to determine your load to store ratio.
+
diff --git a/collectors/likwid/groups/kabini/FLOPS_DP.txt b/collectors/likwid/groups/kabini/FLOPS_DP.txt
new file mode 100644
index 0000000..1a4e54c
--- /dev/null
+++ b/collectors/likwid/groups/kabini/FLOPS_DP.txt
@@ -0,0 +1,26 @@
+SHORT Double Precision MFLOP/s
+
+EVENTSET
+PMC0 RETIRED_INSTRUCTIONS
+PMC1 CPU_CLOCKS_UNHALTED
+PMC2 RETIRED_UOPS
+PMC3 RETIRED_FLOPS_DOUBLE_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] PMC1*inverseClock
+DP [MFLOP/s] 1.0E-06*(PMC3)/time
+CPI PMC1/PMC0
+CPI (based on uops) PMC1/PMC2
+IPC PMC0/PMC1
+
+LONG
+Formulas:
+DP [MFLOP/s] = 1.0E-06*(RETIRED_FLOPS_DOUBLE_ALL)/time
+CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS
+CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS
+IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED
+-
+Profiling group to measure double precision FLOP rate.
+
+
diff --git a/collectors/likwid/groups/kabini/FLOPS_SP.txt b/collectors/likwid/groups/kabini/FLOPS_SP.txt
new file mode 100644
index 0000000..f6c08c1
--- /dev/null
+++ b/collectors/likwid/groups/kabini/FLOPS_SP.txt
@@ -0,0 +1,26 @@
+SHORT Single Precision MFLOP/s
+
+EVENTSET
+PMC0 RETIRED_INSTRUCTIONS
+PMC1 CPU_CLOCKS_UNHALTED
+PMC2 RETIRED_UOPS
+PMC3 RETIRED_FLOPS_SINGLE_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] PMC1*inverseClock
+SP [MFLOP/s] 1.0E-06*(PMC3)/time
+CPI PMC1/PMC0
+CPI (based on uops) PMC1/PMC2
+IPC PMC0/PMC1
+
+LONG
+Formulas:
+SP [MFLOP/s] = 1.0E-06*(RETIRED_FLOPS_SINGLE_ALL)/time
+CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS
+CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS
+IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED
+-
+Profiling group to measure single precision FLOP rate.
+
+
diff --git a/collectors/likwid/groups/kabini/FPU_EXCEPTION.txt b/collectors/likwid/groups/kabini/FPU_EXCEPTION.txt
new file mode 100644
index 0000000..5ed02c6
--- /dev/null
+++ b/collectors/likwid/groups/kabini/FPU_EXCEPTION.txt
@@ -0,0 +1,21 @@
+SHORT Floating point exceptions
+
+EVENTSET
+PMC0 RETIRED_INSTRUCTIONS
+PMC1 RETIRED_FP_INSTRUCTIONS_ALL
+PMC2 FPU_EXCEPTION_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Overall FP exception rate PMC2/PMC0
+FP exception rate PMC2/PMC1
+
+LONG
+Formulas:
+Overall FP exception rate = FPU_EXCEPTION_ALL / RETIRED_INSTRUCTIONS
+FP exception rate = FPU_EXCEPTION_ALL / RETIRED_FP_INSTRUCTIONS_ALL
+-
+Floating point exceptions occur e.g. on the treatment of denormal numbers.
+There might be a large penalty if there are too many floating point
+exceptions.
+
diff --git a/collectors/likwid/groups/kabini/ICACHE.txt b/collectors/likwid/groups/kabini/ICACHE.txt
new file mode 100644
index 0000000..62b91d6
--- /dev/null
+++ b/collectors/likwid/groups/kabini/ICACHE.txt
@@ -0,0 +1,23 @@
+SHORT Instruction cache miss rate/ratio
+
+EVENTSET
+PMC0 INSTRUCTION_CACHE_FETCHES
+PMC1 INSTRUCTION_CACHE_L2_REFILLS
+PMC2 INSTRUCTION_CACHE_SYSTEM_REFILLS
+PMC3 RETIRED_INSTRUCTIONS
+
+METRICS
+Runtime (RDTSC) [s] time
+L1I request rate PMC0/PMC3
+L1I miss rate (PMC1+PMC2)/PMC3
+L1I miss ratio (PMC1+PMC2)/PMC0
+
+LONG
+Formulas:
+L1I request rate = INSTRUCTION_CACHE_FETCHES / RETIRED_INSTRUCTIONS
+L1I miss rate = (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/RETIRED_INSTRUCTIONS
+L1I miss ratio = (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/INSTRUCTION_CACHE_FETCHES
+-
+This group measures the locality of your instruction code with regard to the
+L1 I-Cache.
+
diff --git a/collectors/likwid/groups/kabini/L2.txt b/collectors/likwid/groups/kabini/L2.txt
new file mode 100644
index 0000000..3598a54
--- /dev/null
+++ b/collectors/likwid/groups/kabini/L2.txt
@@ -0,0 +1,33 @@
+SHORT L2 cache bandwidth in MBytes/s
+
+EVENTSET
+PMC0 DATA_CACHE_REFILLS_ALL
+PMC1 DATA_CACHE_EVICTED_ALL
+PMC2 CPU_CLOCKS_UNHALTED
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] PMC2*inverseClock
+L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*DATA_CACHE_REFILLS_ALL*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*DATA_CACHE_REFILLS_ALL*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*DATA_CACHE_EVICTED_ALL*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*DATA_CACHE_EVICTED_ALL*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_ALL+DATA_CACHE_EVICTED_ALL)*64/time
+L2 data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_ALL+DATA_CACHE_EVICTED_ALL)*64
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is
+computed by the number of cache lines loaded from L2 to L1 and the
+number of modified cache lines evicted from the L1.
+Note that this bandwidth also includes data transfers due to a
+write allocate load on a store miss in L1 and copy back transfers
+originating from L2.
+
diff --git a/collectors/likwid/groups/kabini/MEM.txt b/collectors/likwid/groups/kabini/MEM.txt
new file mode 100644
index 0000000..2fa9dfe
--- /dev/null
+++ b/collectors/likwid/groups/kabini/MEM.txt
@@ -0,0 +1,20 @@
+SHORT Main memory bandwidth in MBytes/s
+
+EVENTSET
+UPMC0 UNC_DRAM_ACCESSES_DCT0_ALL
+UPMC1 UNC_DRAM_ACCESSES_DCT1_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64.0
+
+LONG
+Formulas:
+Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_DRAM_ACCESSES_DCT0_ALL+UNC_DRAM_ACCESSES_DCT1_ALL)*64/time
+Memory data volume [GBytes] = 1.0E-09*(UNC_DRAM_ACCESSES_DCT0_ALL+UNC_DRAM_ACCESSES_DCT1_ALL)*64
+-
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Note: As this group measures the accesses from all cores, it only makes sense
+to measure with one core per socket, similar to the Intel Nehalem Uncore events.
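Since the two DCT counters are uncore events, the socket bandwidth is simply the sum of both memory channels times the 64-byte line size. A minimal C sketch with hypothetical readings:

/* kabini MEM: socket memory bandwidth from both DRAM controllers.
 * Counter readings are hypothetical placeholders. */
#include <stdio.h>

int main(void) {
    double upmc0 = 5.0e8; /* UNC_DRAM_ACCESSES_DCT0_ALL */
    double upmc1 = 4.8e8; /* UNC_DRAM_ACCESSES_DCT1_ALL */
    double time  = 1.0;   /* Runtime (RDTSC) [s] */

    printf("Memory bandwidth [MBytes/s]: %.2f\n",
           1.0e-6 * (upmc0 + upmc1) * 64.0 / time);
    printf("Memory data volume [GBytes]: %.3f\n",
           1.0e-9 * (upmc0 + upmc1) * 64.0);
    return 0;
}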
+ diff --git a/collectors/likwid/groups/kabini/NUMA_0_3.txt b/collectors/likwid/groups/kabini/NUMA_0_3.txt new file mode 100644 index 0000000..79f3618 --- /dev/null +++ b/collectors/likwid/groups/kabini/NUMA_0_3.txt @@ -0,0 +1,28 @@ +SHORT Read/Write Events between the ccNUMA nodes + +EVENTSET +UPMC0 UNC_CPU_TO_DRAM_LOCAL_TO_0 +UPMC1 UNC_CPU_TO_DRAM_LOCAL_TO_1 +UPMC2 UNC_CPU_TO_DRAM_LOCAL_TO_2 +UPMC3 UNC_CPU_TO_DRAM_LOCAL_TO_3 + +METRICS +Runtime (RDTSC) [s] time +DRAM read/write local to 0 [MegaEvents/s] 1.0E-06*UPMC0/time +DRAM read/write local to 1 [MegaEvents/s] 1.0E-06*UPMC1/time +DRAM read/write local to 2 [MegaEvents/s] 1.0E-06*UPMC2/time +DRAM read/write local to 3 [MegaEvents/s] 1.0E-06*UPMC3/time + +LONG +Formulas: +DRAM read/write local to 0 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_0/time +DRAM read/write local to 1 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_1/time +DRAM read/write local to 2 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_2/time +DRAM read/write local to 3 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_3/time +- +Profiling group to measure the traffic from local CPU to the different +DRAM NUMA nodes. This group allows to detect NUMA problems in a threaded +code. You must first determine on which memory domains your code is running. +A code should only have significant traffic to its own memory domain. + + diff --git a/collectors/likwid/groups/kabini/NUMA_4_7.txt b/collectors/likwid/groups/kabini/NUMA_4_7.txt new file mode 100644 index 0000000..7b518db --- /dev/null +++ b/collectors/likwid/groups/kabini/NUMA_4_7.txt @@ -0,0 +1,28 @@ +SHORT Read/Write Events between the ccNUMA nodes + +EVENTSET +UPMC0 UNC_CPU_TO_DRAM_LOCAL_TO_4 +UPMC1 UNC_CPU_TO_DRAM_LOCAL_TO_5 +UPMC2 UNC_CPU_TO_DRAM_LOCAL_TO_6 +UPMC3 UNC_CPU_TO_DRAM_LOCAL_TO_7 + +METRICS +Runtime (RDTSC) [s] time +DRAM read/write local to 4 [MegaEvents/s] 1.0E-06*UPMC0/time +DRAM read/write local to 5 [MegaEvents/s] 1.0E-06*UPMC1/time +DRAM read/write local to 6 [MegaEvents/s] 1.0E-06*UPMC2/time +DRAM read/write local to 7 [MegaEvents/s] 1.0E-06*UPMC3/time + +LONG +Formulas: +DRAM read/write local to 4 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_4/time +DRAM read/write local to 5 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_5/time +DRAM read/write local to 6 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_6/time +DRAM read/write local to 7 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_7/time +- +Profiling group to measure the traffic from local CPU to the different +DRAM NUMA nodes. This group allows to detect NUMA problems in a threaded +code. You must first determine on which memory domains your code is running. +A code should only have significant traffic to its own memory domain. 
+
+
diff --git a/collectors/likwid/groups/kabini/TLB.txt b/collectors/likwid/groups/kabini/TLB.txt
new file mode 100644
index 0000000..f66b3cb
--- /dev/null
+++ b/collectors/likwid/groups/kabini/TLB.txt
@@ -0,0 +1,34 @@
+SHORT TLB miss rate/ratio
+
+EVENTSET
+PMC0 RETIRED_INSTRUCTIONS
+PMC1 DATA_CACHE_ACCESSES
+PMC2 L2_DTLB_HIT_ALL
+PMC3 DTLB_MISS_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+L1 DTLB request rate PMC1/PMC0
+L1 DTLB miss rate (PMC2+PMC3)/PMC0
+L1 DTLB miss ratio (PMC2+PMC3)/PMC1
+L2 DTLB request rate (PMC2+PMC3)/PMC0
+L2 DTLB miss rate PMC3/PMC0
+L2 DTLB miss ratio PMC3/(PMC2+PMC3)
+
+
+LONG
+Formulas:
+L1 DTLB request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS
+L1 DTLB miss rate = (L2_DTLB_HIT_ALL+DTLB_MISS_ALL)/RETIRED_INSTRUCTIONS
+L1 DTLB miss ratio = (L2_DTLB_HIT_ALL+DTLB_MISS_ALL)/DATA_CACHE_ACCESSES
+L2 DTLB request rate = (L2_DTLB_HIT_ALL+DTLB_MISS_ALL)/RETIRED_INSTRUCTIONS
+L2 DTLB miss rate = DTLB_MISS_ALL / RETIRED_INSTRUCTIONS
+L2 DTLB miss ratio = DTLB_MISS_ALL / (L2_DTLB_HIT_ALL+DTLB_MISS_ALL)
+-
+L1 DTLB request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The DTLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. And finally the L1 DTLB miss ratio tells you how many
+of your memory references caused a TLB miss on average.
+NOTE: The L2 metrics are only relevant if the L2 DTLB request rate is
+equal to the L1 DTLB miss rate!
diff --git a/collectors/likwid/groups/knl/BRANCH.txt b/collectors/likwid/groups/knl/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/collectors/likwid/groups/knl/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 BR_INST_RETIRED_ALL_BRANCHES
+PMC1 BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Branch rate PMC0/FIXC0
+Branch misprediction rate PMC1/FIXC0
+Branch misprediction ratio PMC1/PMC0
+Instructions per branch FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
+into relation what ratio of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
+
diff --git a/collectors/likwid/groups/knl/CLOCK.txt b/collectors/likwid/groups/knl/CLOCK.txt
new file mode 100644
index 0000000..8756ef2
--- /dev/null
+++ b/collectors/likwid/groups/knl/CLOCK.txt
@@ -0,0 +1,23 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+
+LONG
+Formulas:
+Power = PWR_PKG_ENERGY / time
+-
+The Xeon Phi (Knights Landing) implements the new RAPL interface. This interface
+enables monitoring of the consumed energy on the package (socket) level.
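RAPL exposes energy, not power, so the power metric is simply the consumed energy divided by the measurement interval. A minimal C sketch, assuming the PWR_PKG_ENERGY reading has already been converted to Joules (the value below is a hypothetical placeholder):

/* Derive average package power from a RAPL energy reading. */
#include <stdio.h>

int main(void) {
    double pwr0 = 150.0; /* PWR_PKG_ENERGY [J] consumed during the run (hypothetical) */
    double time = 2.0;   /* Runtime (RDTSC) [s] */

    printf("Energy [J]: %.2f\n", pwr0);
    printf("Power [W]:  %.2f\n", pwr0 / time);
    return 0;
}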
+ diff --git a/collectors/likwid/groups/knl/DATA.txt b/collectors/likwid/groups/knl/DATA.txt new file mode 100644 index 0000000..61a915b --- /dev/null +++ b/collectors/likwid/groups/knl/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_UOPS_RETIRED_ALL_LOADS +PMC1 MEM_UOPS_RETIRED_ALL_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_UOPS_RETIRED_ALL_LOADS/MEM_UOPS_RETIRED_ALL_STORES +- +This is a metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/knl/DIVIDE.txt b/collectors/likwid/groups/knl/DIVIDE.txt new file mode 100644 index 0000000..d9b0918 --- /dev/null +++ b/collectors/likwid/groups/knl/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLES_DIV_BUSY_COUNT +PMC1 CYCLES_DIV_BUSY + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = CYCLES_DIV_BUSY_COUNT +Avg. divide unit usage duration = CYCLES_DIV_BUSY/CYCLES_DIV_BUSY_COUNT +- +This performance group measures the average latency of divide operations diff --git a/collectors/likwid/groups/knl/ENERGY.txt b/collectors/likwid/groups/knl/ENERGY.txt new file mode 100644 index 0000000..19ede75 --- /dev/null +++ b/collectors/likwid/groups/knl/ENERGY.txt @@ -0,0 +1,33 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR3 PWR_DRAM_ENERGY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +Knights Landing implements the new RAPL interface. This interface enables to +monitor the consumed energy on the package (socket) level. 
+ diff --git a/collectors/likwid/groups/knl/FLOPS_DP.txt b/collectors/likwid/groups/knl/FLOPS_DP.txt new file mode 100644 index 0000000..88bffe2 --- /dev/null +++ b/collectors/likwid/groups/knl/FLOPS_DP.txt @@ -0,0 +1,34 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_RETIRED_SCALAR_SIMD +PMC1 UOPS_RETIRED_PACKED_SIMD + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] (SSE assumed) 1.0E-06*((PMC1*2.0)+PMC0)/time +DP [MFLOP/s] (AVX assumed) 1.0E-06*((PMC1*4.0)+PMC0)/time +DP [MFLOP/s] (AVX512 assumed) 1.0E-06*((PMC1*8.0)+PMC0)/time +Packed [MUOPS/s] 1.0E-06*(PMC1)/time +Scalar [MUOPS/s] 1.0E-06*PMC0/time + +LONG +Formulas: +DP [MFLOP/s] (SSE assumed) = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD*2+UOPS_RETIRED_SCALAR_SIMD)/runtime +DP [MFLOP/s] (AVX assumed) = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD*4+UOPS_RETIRED_SCALAR_SIMD)/runtime +DP [MFLOP/s] (AVX512 assumed) = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD*8+UOPS_RETIRED_SCALAR_SIMD)/runtime +Packed [MUOPS/s] = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD)/runtime +Scalar [MUOPS/s] = 1.0E-06*UOPS_RETIRED_SCALAR_SIMD/runtime +- +AVX/SSE scalar and packed double precision FLOP rates. The Xeon Phi (Knights Landing) provides +no possibility to differentiate between double and single precision FLOP/s. Therefore, we only +assume that the printed [MFLOP/s] value is for double-precision code. Moreover, there is no way +to distinguish between SSE, AVX or AVX512 packed SIMD operations. Therefore, this group prints +out the [MFLOP/s] for different SIMD techniques. +WARNING: The events also count for integer arithmetics diff --git a/collectors/likwid/groups/knl/FLOPS_SP.txt b/collectors/likwid/groups/knl/FLOPS_SP.txt new file mode 100644 index 0000000..4a28116 --- /dev/null +++ b/collectors/likwid/groups/knl/FLOPS_SP.txt @@ -0,0 +1,34 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_RETIRED_SCALAR_SIMD +PMC1 UOPS_RETIRED_PACKED_SIMD + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] (SSE assumed) 1.0E-06*(PMC1*4.0+PMC0)/time +SP [MFLOP/s] (AVX assumed) 1.0E-06*(PMC1*8.0+PMC0)/time +SP [MFLOP/s] (AVX512 assumed) 1.0E-06*(PMC1*16.0+PMC0)/time +Packed [MUOPS/s] 1.0E-06*(PMC1)/time +Scalar [MUOPS/s] 1.0E-06*PMC0/time + +LONG +Formulas: +SP [MFLOP/s] (SSE assumed) = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD*4+UOPS_RETIRED_SCALAR_SIMD)/runtime +SP [MFLOP/s] (AVX assumed) = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD*8+UOPS_RETIRED_SCALAR_SIMD)/runtime +SP [MFLOP/s] (AVX512 assumed) = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD*16+UOPS_RETIRED_SCALAR_SIMD)/runtime +Packed [MUOPS/s] = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD)/runtime +Scalar [MUOPS/s] = 1.0E-06*UOPS_RETIRED_SCALAR_SIMD/runtime +- +AVX/SSE scalar and packed single precision FLOP rates. The Xeon Phi (Knights Landing) provides +no possibility to differentiate between double and single precision FLOP/s. Therefore, we only +assume that the printed MFLOP/s value is for single-precision code. Moreover, there is no way +to distinguish between SSE, AVX or AVX512 packed SIMD operations. Therefore, this group prints +out the MFLOP/s for different SIMD techniques. 
+WARNING: The events also count for integer arithmetics diff --git a/collectors/likwid/groups/knl/FRONTEND_STALLS.txt b/collectors/likwid/groups/knl/FRONTEND_STALLS.txt new file mode 100644 index 0000000..1b9f98e --- /dev/null +++ b/collectors/likwid/groups/knl/FRONTEND_STALLS.txt @@ -0,0 +1,25 @@ +SHORT Frontend stalls + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 NO_ALLOC_CYCLES_ALL +PMC1 NO_ALLOC_CYCLES_ALL_COUNT + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Frontend stalls PMC1 +Avg. frontend stall duration [cyc] PMC0/PMC1 +Frontend stall ratio PMC0/FIXC1 + +LONG +Formulas: +Frontend stalls = NO_ALLOC_CYCLES_ALL +Avg. frontend stall duration [cyc] = NO_ALLOC_CYCLES_ALL/NO_ALLOC_CYCLES_ALL_COUNT +Frontend stall ratio = NO_ALLOC_CYCLES_ALL/CPU_CLK_UNHALTED_CORE +- +Frontend stalls diff --git a/collectors/likwid/groups/knl/HBM.txt b/collectors/likwid/groups/knl/HBM.txt new file mode 100644 index 0000000..ac44418 --- /dev/null +++ b/collectors/likwid/groups/knl/HBM.txt @@ -0,0 +1,46 @@ +SHORT Memory bandwidth in MBytes/s for High Bandwidth Memory (HBM) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +EDBOX0C0 EDC_RPQ_INSERTS +EDBOX1C0 EDC_RPQ_INSERTS +EDBOX2C0 EDC_RPQ_INSERTS +EDBOX3C0 EDC_RPQ_INSERTS +EDBOX4C0 EDC_RPQ_INSERTS +EDBOX5C0 EDC_RPQ_INSERTS +EDBOX6C0 EDC_RPQ_INSERTS +EDBOX7C0 EDC_RPQ_INSERTS +EDBOX0C1 EDC_WPQ_INSERTS +EDBOX1C1 EDC_WPQ_INSERTS +EDBOX2C1 EDC_WPQ_INSERTS +EDBOX3C1 EDC_WPQ_INSERTS +EDBOX4C1 EDC_WPQ_INSERTS +EDBOX5C1 EDC_WPQ_INSERTS +EDBOX6C1 EDC_WPQ_INSERTS +EDBOX7C1 EDC_WPQ_INSERTS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(EDBOX0C0+EDBOX1C0+EDBOX2C0+EDBOX3C0+EDBOX4C0+EDBOX5C0+EDBOX6C0+EDBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(EDBOX0C0+EDBOX1C0+EDBOX2C0+EDBOX3C0+EDBOX4C0+EDBOX5C0+EDBOX6C0+EDBOX7C0)*64.0 +Memory writeback bandwidth [MBytes/s] 1.0E-06*(EDBOX0C1+EDBOX1C1+EDBOX2C1+EDBOX3C1+EDBOX4C1+EDBOX5C1+EDBOX6C1+EDBOX7C1)*64.0/time +Memory writeback data volume [GBytes] 1.0E-09*(EDBOX0C1+EDBOX1C1+EDBOX2C1+EDBOX3C1+EDBOX4C1+EDBOX5C1+EDBOX6C1+EDBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(EDBOX0C0+EDBOX1C0+EDBOX2C0+EDBOX3C0+EDBOX4C0+EDBOX5C0+EDBOX6C0+EDBOX7C0+EDBOX0C1+EDBOX1C1+EDBOX2C1+EDBOX3C1+EDBOX4C1+EDBOX5C1+EDBOX6C1+EDBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(EDBOX0C0+EDBOX1C0+EDBOX2C0+EDBOX3C0+EDBOX4C0+EDBOX5C0+EDBOX6C0+EDBOX7C0+EDBOX0C1+EDBOX1C1+EDBOX2C1+EDBOX3C1+EDBOX4C1+EDBOX5C1+EDBOX6C1+EDBOX7C1)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(sum(EDC_RPQ_INSERTS))*64/time +Memory read data volume [GBytes] = 1.0E-09*(sum(EDC_RPQ_INSERTS))*64 +Memory writeback bandwidth [MBytes/s] = 1.0E-06*(sum(EDC_WPQ_INSERTS))*64/time +Memory writeback data volume [GBytes] = 1.0E-09*(sum(EDC_WPQ_INSERTS))*64 +Memory bandwidth [MBytes/s] = 1.0E-06*(sum(EDC_RPQ_INSERTS)+sum(EDC_WPQ_INSERTS))*64/time +Memory data volume [GBytes] = 1.0E-09*(sum(EDC_RPQ_INSERTS)+sum(EDC_WPQ_INSERTS))*64 +- +Profiling group to measure data transfers from and to the high bandwidth memory (HBM). 
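The HBM metrics sum the read (RPQ) and write (WPQ) queue insertions over all eight EDC boxes and scale by the 64-byte line size. A compact C sketch with hypothetical per-box readings:

/* knl HBM: aggregate MCDRAM bandwidth over all eight EDC boxes.
 * Per-box readings are hypothetical placeholders. */
#include <stdio.h>

int main(void) {
    double rpq[8] = {1e8, 1e8, 9e7, 9e7, 1e8, 1e8, 9e7, 9e7}; /* EDC_RPQ_INSERTS per box */
    double wpq[8] = {4e7, 4e7, 3e7, 3e7, 4e7, 4e7, 3e7, 3e7}; /* EDC_WPQ_INSERTS per box */
    double time = 1.0, rd = 0.0, wr = 0.0;

    for (int i = 0; i < 8; i++) { rd += rpq[i]; wr += wpq[i]; }
    printf("Memory read bandwidth [MBytes/s]:      %.2f\n", 1.0e-6 * rd * 64.0 / time);
    printf("Memory writeback bandwidth [MBytes/s]: %.2f\n", 1.0e-6 * wr * 64.0 / time);
    printf("Memory bandwidth [MBytes/s]:           %.2f\n", 1.0e-6 * (rd + wr) * 64.0 / time);
    return 0;
}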
+ diff --git a/collectors/likwid/groups/knl/HBM_CACHE.txt b/collectors/likwid/groups/knl/HBM_CACHE.txt new file mode 100644 index 0000000..f89af5d --- /dev/null +++ b/collectors/likwid/groups/knl/HBM_CACHE.txt @@ -0,0 +1,87 @@ +SHORT Memory bandwidth in MBytes/s for High Bandwidth Memory (HBM) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +EDBOX0C0 EDC_RPQ_INSERTS +EDBOX1C0 EDC_RPQ_INSERTS +EDBOX2C0 EDC_RPQ_INSERTS +EDBOX3C0 EDC_RPQ_INSERTS +EDBOX4C0 EDC_RPQ_INSERTS +EDBOX5C0 EDC_RPQ_INSERTS +EDBOX6C0 EDC_RPQ_INSERTS +EDBOX7C0 EDC_RPQ_INSERTS +EDBOX0C1 EDC_WPQ_INSERTS +EDBOX1C1 EDC_WPQ_INSERTS +EDBOX2C1 EDC_WPQ_INSERTS +EDBOX3C1 EDC_WPQ_INSERTS +EDBOX4C1 EDC_WPQ_INSERTS +EDBOX5C1 EDC_WPQ_INSERTS +EDBOX6C1 EDC_WPQ_INSERTS +EDBOX7C1 EDC_WPQ_INSERTS +EUBOX0C0 EDC_MISS_CLEAN +EUBOX1C0 EDC_MISS_CLEAN +EUBOX2C0 EDC_MISS_CLEAN +EUBOX3C0 EDC_MISS_CLEAN +EUBOX4C0 EDC_MISS_CLEAN +EUBOX5C0 EDC_MISS_CLEAN +EUBOX6C0 EDC_MISS_CLEAN +EUBOX7C0 EDC_MISS_CLEAN +EUBOX0C1 EDC_MISS_DIRTY +EUBOX1C1 EDC_MISS_DIRTY +EUBOX2C1 EDC_MISS_DIRTY +EUBOX3C1 EDC_MISS_DIRTY +EUBOX4C1 EDC_MISS_DIRTY +EUBOX5C1 EDC_MISS_DIRTY +EUBOX6C1 EDC_MISS_DIRTY +EUBOX7C1 EDC_MISS_DIRTY +MBOX0C0 MC_CAS_READS +MBOX0C1 MC_CAS_WRITES +MBOX1C0 MC_CAS_READS +MBOX1C1 MC_CAS_WRITES +MBOX2C0 MC_CAS_READS +MBOX2C1 MC_CAS_WRITES +MBOX4C0 MC_CAS_READS +MBOX4C1 MC_CAS_WRITES +MBOX5C0 MC_CAS_READS +MBOX5C1 MC_CAS_WRITES +MBOX6C0 MC_CAS_READS +MBOX6C1 MC_CAS_WRITES + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +MCDRAM Memory read bandwidth [MBytes/s] 1.0E-06*((EDBOX0C0+EDBOX1C0+EDBOX2C0+EDBOX3C0+EDBOX4C0+EDBOX5C0+EDBOX6C0+EDBOX7C0)-(EUBOX0C0+EUBOX1C0+EUBOX2C0+EUBOX3C0+EUBOX4C0+EUBOX5C0+EUBOX6C0+EUBOX7C0)-(EUBOX0C1+EUBOX1C1+EUBOX2C1+EUBOX3C1+EUBOX4C1+EUBOX5C1+EUBOX6C1+EUBOX7C1))*64.0/time +MCDRAM Memory read data volume [GBytes] 1.0E-09*((EDBOX0C0+EDBOX1C0+EDBOX2C0+EDBOX3C0+EDBOX4C0+EDBOX5C0+EDBOX6C0+EDBOX7C0)-(EUBOX0C0+EUBOX1C0+EUBOX2C0+EUBOX3C0+EUBOX4C0+EUBOX5C0+EUBOX6C0+EUBOX7C0)-(EUBOX0C1+EUBOX1C1+EUBOX2C1+EUBOX3C1+EUBOX4C1+EUBOX5C1+EUBOX6C1+EUBOX7C1))*64.0 +MCDRAM Memory writeback bandwidth [MBytes/s] 1.0E-06*((EDBOX0C1+EDBOX1C1+EDBOX2C1+EDBOX3C1+EDBOX4C1+EDBOX5C1+EDBOX6C1+EDBOX7C1)-(MBOX0C0+MBOX1C0+MBOX2C0+MBOX4C0+MBOX5C0+MBOX6C0))*64.0/time +MCDRAM Memory writeback data volume [GBytes] 1.0E-09*((EDBOX0C1+EDBOX1C1+EDBOX2C1+EDBOX3C1+EDBOX4C1+EDBOX5C1+EDBOX6C1+EDBOX7C1)-(MBOX0C0+MBOX1C0+MBOX2C0+MBOX4C0+MBOX5C0+MBOX6C0))*64.0 +MCDRAM Memory bandwidth [MBytes/s] 1.0E-06*(((EDBOX0C0+EDBOX1C0+EDBOX2C0+EDBOX3C0+EDBOX4C0+EDBOX5C0+EDBOX6C0+EDBOX7C0)-(EUBOX0C0+EUBOX1C0+EUBOX2C0+EUBOX3C0+EUBOX4C0+EUBOX5C0+EUBOX6C0+EUBOX7C0)-(EUBOX0C1+EUBOX1C1+EUBOX2C1+EUBOX3C1+EUBOX4C1+EUBOX5C1+EUBOX6C1+EUBOX7C1))+((EDBOX0C1+EDBOX1C1+EDBOX2C1+EDBOX3C1+EDBOX4C1+EDBOX5C1+EDBOX6C1+EDBOX7C1)-(MBOX0C0+MBOX1C0+MBOX2C0+MBOX4C0+MBOX5C0+MBOX6C0)))*64.0/time +MCDRAM Memory data volume [GBytes] 1.0E-09*(((EDBOX0C0+EDBOX1C0+EDBOX2C0+EDBOX3C0+EDBOX4C0+EDBOX5C0+EDBOX6C0+EDBOX7C0)-(EUBOX0C0+EUBOX1C0+EUBOX2C0+EUBOX3C0+EUBOX4C0+EUBOX5C0+EUBOX6C0+EUBOX7C0)-(EUBOX0C1+EUBOX1C1+EUBOX2C1+EUBOX3C1+EUBOX4C1+EUBOX5C1+EUBOX6C1+EUBOX7C1))+((EDBOX0C1+EDBOX1C1+EDBOX2C1+EDBOX3C1+EDBOX4C1+EDBOX5C1+EDBOX6C1+EDBOX7C1)-(MBOX0C0+MBOX1C0+MBOX2C0+MBOX4C0+MBOX5C0+MBOX6C0)))*64.0 +DDR Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX4C0+MBOX5C0+MBOX6C0)*64.0/time +DDR Memory read data volume [GBytes] 
1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX4C0+MBOX5C0+MBOX6C0)*64.0 +DDR Memory writeback bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX4C1+MBOX5C1+MBOX6C1)*64.0/time +DDR Memory writeback data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX4C1+MBOX5C1+MBOX6C1)*64.0 +DDR Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX4C1+MBOX5C1+MBOX6C1)*64.0/time +DDR Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX4C1+MBOX5C1+MBOX6C1)*64.0 + + +LONG +Formulas: +MCDRAM Memory read bandwidth [MBytes/s] = 1.0E-06*(sum(EDC_RPQ_INSERTS))*64/time +MCDRAM Memory read data volume [GBytes] = 1.0E-09*(sum(EDC_RPQ_INSERTS))*64 +MCDRAM Memory writeback bandwidth [MBytes/s] = 1.0E-06*(sum(EDC_WPQ_INSERTS))*64/time +MCDRAM Memory writeback data volume [GBytes] = 1.0E-09*(sum(EDC_WPQ_INSERTS))*64 +MCDRAM Memory bandwidth [MBytes/s] = 1.0E-06*(sum(EDC_RPQ_INSERTS)+sum(EDC_WPQ_INSERTS))*64/time +MCDRAM Memory data volume [GBytes] = 1.0E-09*(sum(EDC_RPQ_INSERTS)+sum(EDC_WPQ_INSERTS))*64 +DDR Memory read bandwidth [MBytes/s] = 1.0E-06*(sum(MC_CAS_READS))*64/time +DDR Memory read data volume [GBytes] = 1.0E-09*(sum(MC_CAS_READS))*64 +DDR Memory writeback bandwidth [MBytes/s] = 1.0E-06*(sum(MC_CAS_WRITES))*64/time +DDR Memory writeback data volume [GBytes] = 1.0E-09*(sum(MC_CAS_WRITES))*64 +DDR Memory bandwidth [MBytes/s] = 1.0E-06*(sum(MC_CAS_READS)+sum(MC_CAS_WRITES))*64/time +DDR Memory data volume [GBytes] = 1.0E-09*(sum(MC_CAS_READS)+sum(MC_CAS_WRITES))*64 +- +Profiling group to measure data transfers from and to the high bandwidth memory (HBM). diff --git a/collectors/likwid/groups/knl/HBM_OFFCORE.txt b/collectors/likwid/groups/knl/HBM_OFFCORE.txt new file mode 100644 index 0000000..626d08f --- /dev/null +++ b/collectors/likwid/groups/knl/HBM_OFFCORE.txt @@ -0,0 +1,32 @@ +SHORT Memory bandwidth in MBytes/s for High Bandwidth Memory (HBM) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0:MATCH0=0x4908:MATCH1=0x3F8060 OFFCORE_RESPONSE_0_OPTIONS +PMC1:MATCH0=0x32F7:MATCH1=0x3F8060 OFFCORE_RESPONSE_1_OPTIONS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(PMC1)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(PMC1)*64.0 +Memory writeback bandwidth [MBytes/s] 1.0E-06*(PMC0)*64.0/time +Memory writeback data volume [GBytes] 1.0E-09*(PMC0)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(sum(OFFCORE_RESPONSE_1_OPTIONS:MATCH0=0x32F7:MATCH1=0x3F8060))*64/time +Memory read data volume [GBytes] = 1.0E-09*(sum(OFFCORE_RESPONSE_1_OPTIONS:MATCH0=0x32F7:MATCH1=0x3F8060))*64 +Memory writeback bandwidth [MBytes/s] = 1.0E-06*(sum(OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x4908:MATCH1=0x3F8060))*64/time +Memory writeback data volume [GBytes] = 1.0E-09*(sum(OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x4908:MATCH1=0x3F8060))*64 +Memory bandwidth [MBytes/s] = 1.0E-06*(sum(OFFCORE_RESPONSE_1_OPTIONS:MATCH0=0x32F7:MATCH1=0x3F8060)+sum(OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x4908:MATCH1=0x3F8060))*64/time +Memory data volume [GBytes] = 1.0E-09*(sum(OFFCORE_RESPONSE_1_OPTIONS:MATCH0=0x32F7:MATCH1=0x3F8060)+sum(OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x4908:MATCH1=0x3F8060))*64 +- +Profiling group to measure data 
transfers from and to the high bandwidth memory (HBM).
+If possible, use the HBM or HBM_CACHE group because they provide more accurate counts.
diff --git a/collectors/likwid/groups/knl/ICACHE.txt b/collectors/likwid/groups/knl/ICACHE.txt
new file mode 100644
index 0000000..5f11ad6
--- /dev/null
+++ b/collectors/likwid/groups/knl/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ICACHE_ACCESSES
+PMC1 ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+-
+This group measures some L1 instruction cache metrics.
diff --git a/collectors/likwid/groups/knl/L2.txt b/collectors/likwid/groups/knl/L2.txt
new file mode 100644
index 0000000..4a9370c
--- /dev/null
+++ b/collectors/likwid/groups/knl/L2.txt
@@ -0,0 +1,36 @@
+SHORT L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_REQUESTS_REFERENCE
+PMC1:MATCH0=0x0002:MATCH1=0x1 OFFCORE_RESPONSE_0_OPTIONS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2 non-RFO bandwidth [MBytes/s] 1.E-06*(PMC0)*64.0/time
+L2 non-RFO data volume [GByte] 1.E-09*PMC0*64.0
+L2 RFO bandwidth [MBytes/s] 1.E-06*(PMC1)*64.0/time
+L2 RFO data volume [GByte] 1.E-09*(PMC1)*64.0
+L2 bandwidth [MBytes/s] 1.E-06*(PMC0+PMC1)*64.0/time
+L2 data volume [GByte] 1.E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L2 non-RFO bandwidth [MBytes/s] = 1.E-06*L2_REQUESTS_REFERENCE*64.0/time
+L2 non-RFO data volume [GByte] = 1.E-09*L2_REQUESTS_REFERENCE*64.0
+L2 RFO bandwidth [MBytes/s] = 1.E-06*(OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x0002:MATCH1=0x1)*64.0/time
+L2 RFO data volume [GByte] = 1.E-09*(OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x0002:MATCH1=0x1)*64.0
+L2 bandwidth [MBytes/s] = 1.E-06*(L2_REQUESTS_REFERENCE+OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x0002:MATCH1=0x1)*64.0/time
+L2 data volume [GByte] = 1.E-09*(L2_REQUESTS_REFERENCE+OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x0002:MATCH1=0x1)*64.0
+-
+The L2 bandwidth and data volume do not contain RFOs (also called
+write-allocates). The RFO bandwidth and data volume are only accurate when all
+used data fits in the L2 cache. As soon as the data exceeds the L2 cache size,
+the RFO metrics are too high.
+Moreover, with an increasing count of measured cores, the non-RFO metrics overcount
+but commonly stay within a 10% error.
diff --git a/collectors/likwid/groups/knl/L2CACHE.txt b/collectors/likwid/groups/knl/L2CACHE.txt new file mode 100644 index 0000000..e6de92f --- /dev/null +++ b/collectors/likwid/groups/knl/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_UOPS_RETIRED_L2_HIT_LOADS +PMC1 MEM_UOPS_RETIRED_L2_MISS_LOADS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate (PMC0+PMC1)/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/(PMC0+PMC1) + +LONG +Formulas: +L2 request rate = (MEM_UOPS_RETIRED_L2_HIT_LOADS+MEM_UOPS_RETIRED_L2_MISS_LOADS)/INSTR_RETIRED_ANY +L2 miss rate = MEM_UOPS_RETIRED_L2_MISS_LOADS/INSTR_RETIRED_ANY +L2 miss ratio = MEM_UOPS_RETIRED_L2_MISS_LOADS/(MEM_UOPS_RETIRED_L2_HIT_LOADS+MEM_UOPS_RETIRED_L2_MISS_LOADS) +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure how often it was necessary to get +cache lines from memory. And finally L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get data cache miss ratio as low as possible by increasing your cache +reuse. + diff --git a/collectors/likwid/groups/knl/MEM.txt b/collectors/likwid/groups/knl/MEM.txt new file mode 100644 index 0000000..0e53431 --- /dev/null +++ b/collectors/likwid/groups/knl/MEM.txt @@ -0,0 +1,47 @@ +SHORT Memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +MBOX0C0 MC_CAS_READS +MBOX0C1 MC_CAS_WRITES +MBOX1C0 MC_CAS_READS +MBOX1C1 MC_CAS_WRITES +MBOX2C0 MC_CAS_READS +MBOX2C1 MC_CAS_WRITES +MBOX4C0 MC_CAS_READS +MBOX4C1 MC_CAS_WRITES +MBOX5C0 MC_CAS_READS +MBOX5C1 MC_CAS_WRITES +MBOX6C0 MC_CAS_READS +MBOX6C1 MC_CAS_WRITES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX4C0+MBOX5C0+MBOX6C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX4C0+MBOX5C0+MBOX6C0)*64.0 +Memory writeback bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX4C1+MBOX5C1+MBOX6C1)*64.0/time +Memory writeback data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX4C1+MBOX5C1+MBOX6C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX4C1+MBOX5C1+MBOX6C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX4C1+MBOX5C1+MBOX6C1)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(sum(MC_CAS_READS))*64/time +Memory read data volume [GBytes] = 1.0E-09*(sum(MC_CAS_READS))*64 +Memory writeback bandwidth [MBytes/s] = 1.0E-06*(sum(MC_CAS_WRITES))*64/time +Memory writeback data volume [GBytes] = 1.0E-09*(sum(MC_CAS_WRITES))*64 +Memory bandwidth [MBytes/s] = 1.0E-06*(sum(MC_CAS_READS)+sum(MC_CAS_WRITES))*64/time +Memory data volume [GBytes] = 1.0E-09*(sum(MC_CAS_READS)+sum(MC_CAS_WRITES))*64 +- +Profiling group to measure L2 to MEM load cache bandwidth. 
The bandwidth is computed by the
+number of cache lines allocated in the L2 cache. Since there is no possibility to retrieve
+the evicted cache lines, this group measures only the load cache bandwidth. The
+writeback metrics count only modified cache lines that are written back to go to
+exclusive state.
+The group also outputs the total load and writeback data volume transferred between memory and L2.
+
diff --git a/collectors/likwid/groups/knl/TLB_DATA.txt b/collectors/likwid/groups/knl/TLB_DATA.txt
new file mode 100644
index 0000000..5f2617f
--- /dev/null
+++ b/collectors/likwid/groups/knl/TLB_DATA.txt
@@ -0,0 +1,27 @@
+SHORT L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 PAGE_WALKS_DTLB_COUNT
+PMC1 PAGE_WALKS_DTLB_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 DTLB misses PMC0
+L1 DTLB miss rate PMC0/FIXC0
+L1 DTLB miss duration [Cyc] PMC1/PMC0
+
+LONG
+Formulas:
+L1 DTLB misses = PAGE_WALKS_DTLB_COUNT
+L1 DTLB miss rate = PAGE_WALKS_DTLB_COUNT / INSTR_RETIRED_ANY
+L1 DTLB miss duration [Cyc] = PAGE_WALKS_DTLB_CYCLES / PAGE_WALKS_DTLB_COUNT
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The duration measures how many cycles a page walk took.
+
diff --git a/collectors/likwid/groups/knl/TLB_INSTR.txt b/collectors/likwid/groups/knl/TLB_INSTR.txt
new file mode 100644
index 0000000..f3dd3ec
--- /dev/null
+++ b/collectors/likwid/groups/knl/TLB_INSTR.txt
@@ -0,0 +1,27 @@
+SHORT L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 PAGE_WALKS_ITLB_COUNT
+PMC1 PAGE_WALKS_ITLB_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 ITLB misses PMC0
+L1 ITLB miss rate PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = PAGE_WALKS_ITLB_COUNT
+L1 ITLB miss rate = PAGE_WALKS_ITLB_COUNT / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = PAGE_WALKS_ITLB_CYCLES / PAGE_WALKS_ITLB_COUNT
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how many cycles a page walk took.
diff --git a/collectors/likwid/groups/knl/UOPS_STALLS.txt b/collectors/likwid/groups/knl/UOPS_STALLS.txt
new file mode 100644
index 0000000..97cfa99
--- /dev/null
+++ b/collectors/likwid/groups/knl/UOPS_STALLS.txt
@@ -0,0 +1,25 @@
+SHORT UOP retirement stalls
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_RETIRED_STALLED_CYCLES
+PMC1 UOPS_RETIRED_STALLS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Number of stalls PMC1
+Avg. stall duration [cyc] PMC0/PMC1
+Stall ratio PMC0/FIXC1
+
+LONG
+Formulas:
+Number of stalls = UOPS_RETIRED_STALLS
+Avg. stall duration [cyc] = UOPS_RETIRED_STALLED_CYCLES/UOPS_RETIRED_STALLS
+Stall ratio = UOPS_RETIRED_STALLED_CYCLES/CPU_CLK_UNHALTED_CORE
+-
+This group measures stalls in the UOP retirement.
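The pairing of a cycle counter with its _COUNT/_STALLS companion follows the same pattern as the TLB groups above: the quotient gives an average duration. A minimal C sketch with hypothetical readings:

/* knl UOPS_STALLS: average stall duration and stall ratio.
 * Counter readings are hypothetical placeholders. */
#include <stdio.h>

int main(void) {
    double pmc0  = 6.0e8; /* UOPS_RETIRED_STALLED_CYCLES */
    double pmc1  = 2.0e7; /* UOPS_RETIRED_STALLS */
    double fixc1 = 3.0e9; /* CPU_CLK_UNHALTED_CORE */

    printf("Number of stalls:          %.0f\n", pmc1);
    printf("Avg. stall duration [cyc]: %.2f\n", pmc0 / pmc1);
    printf("Stall ratio:               %.4f\n", pmc0 / fixc1);
    return 0;
}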
diff --git a/collectors/likwid/groups/nehalem/BRANCH.txt b/collectors/likwid/groups/nehalem/BRANCH.txt
new file mode 100644
index 0000000..1ef9f11
--- /dev/null
+++ b/collectors/likwid/groups/nehalem/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 BR_INST_RETIRED_ALL_BRANCHES
+PMC1 BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Branch rate PMC0/FIXC0
+Branch misprediction rate PMC1/FIXC0
+Branch misprediction ratio PMC1/PMC0
+Instructions per branch FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio sets directly
+into relation what ratio of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
+
diff --git a/collectors/likwid/groups/nehalem/CACHE.txt b/collectors/likwid/groups/nehalem/CACHE.txt
new file mode 100644
index 0000000..6603171
--- /dev/null
+++ b/collectors/likwid/groups/nehalem/CACHE.txt
@@ -0,0 +1,36 @@
+SHORT Data cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L1D_REPL
+PMC1 L1D_ALL_REF_ANY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+data cache misses PMC0
+data cache request rate PMC1/FIXC0
+data cache miss rate PMC0/FIXC0
+data cache miss ratio PMC0/PMC1
+
+LONG
+Formulas:
+data cache misses = L1D_REPL
+data cache request rate = L1D_ALL_REF_ANY / INSTR_RETIRED_ANY
+data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY
+data cache miss ratio = L1D_REPL / L1D_ALL_REF_ANY
+-
+This group measures the locality of your data accesses with regard to the
+L1 cache. Data cache request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The data cache miss rate gives a measure of how often it was necessary to get
+cache lines from higher levels of the memory hierarchy. And finally
+data cache miss ratio tells you how many of your memory references required
+a cache line to be loaded from a higher level. While the data cache miss rate
+might be given by your algorithm, you should try to get the data cache miss ratio
+as low as possible by increasing your cache reuse.
+ diff --git a/collectors/likwid/groups/nehalem/DATA.txt b/collectors/likwid/groups/nehalem/DATA.txt new file mode 100644 index 0000000..31bba51 --- /dev/null +++ b/collectors/likwid/groups/nehalem/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_INST_RETIRED_LOADS +PMC1 MEM_INST_RETIRED_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_INST_RETIRED_LOADS/MEM_INST_RETIRED_STORES +- +This is a simple metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/nehalem/DIVIDE.txt b/collectors/likwid/groups/nehalem/DIVIDE.txt new file mode 100644 index 0000000..6c17295 --- /dev/null +++ b/collectors/likwid/groups/nehalem/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ARITH_NUM_DIV +PMC1 ARITH_CYCLES_DIV_BUSY + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_NUM_DIV +Avg. divide unit usage duration = ARITH_CYCLES_DIV_BUSY/ARITH_NUM_DIV +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/nehalem/FLOPS_DP.txt b/collectors/likwid/groups/nehalem/FLOPS_DP.txt new file mode 100644 index 0000000..0c2e56c --- /dev/null +++ b/collectors/likwid/groups/nehalem/FLOPS_DP.txt @@ -0,0 +1,35 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR +PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION +PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +SP [MUOPS/s] 1.0E-06*PMC2/time +DP [MUOPS/s] 1.0E-06*PMC3/time + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime +Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime +SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime +DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime +- +The Nehalem cannot directly measure MFLOP/s if mixed-precision calculations are done. +Therefore both single and double precision counts are measured to ensure the correctness +of the measurements. You can check whether your code was vectorized by comparing the number of +FP_COMP_OPS_EXE_SSE_FP_PACKED events with FP_COMP_OPS_EXE_SSE_FP_SCALAR.
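The factor 2.0 in the DP [MFLOP/s] metric reflects that one packed SSE instruction operates on two double-precision elements in a 128-bit register, while a scalar uop performs one FLOP. A small C sketch of the derivation, with invented counter values:

#include <stdio.h>

int main(void) {
    /* Hypothetical counter readings for the FLOPS_DP group. */
    double packed = 1.0e9; /* PMC0: FP_COMP_OPS_EXE_SSE_FP_PACKED */
    double scalar = 2.0e8; /* PMC1: FP_COMP_OPS_EXE_SSE_FP_SCALAR */
    double time   = 1.5;   /* measured runtime in seconds */

    /* Each packed SSE DP uop counts as 2 FLOPs, each scalar uop as 1. */
    double dp_mflops = 1.0e-6 * (packed * 2.0 + scalar) / time;
    printf("DP [MFLOP/s]: %.1f\n", dp_mflops);
    return 0;
}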
+ diff --git a/collectors/likwid/groups/nehalem/FLOPS_SP.txt b/collectors/likwid/groups/nehalem/FLOPS_SP.txt new file mode 100644 index 0000000..8046cbd --- /dev/null +++ b/collectors/likwid/groups/nehalem/FLOPS_SP.txt @@ -0,0 +1,35 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR +PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION +PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +SP [MUOPS/s] 1.0E-06*PMC2/time +DP [MUOPS/s] 1.0E-06*PMC3/time + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime +Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime +SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime +DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime +- +The Nehalem cannot directly measure MFLOP/s if mixed-precision calculations are done. +Therefore both single and double precision counts are measured to ensure the correctness +of the measurements. You can check whether your code was vectorized by comparing the number of +FP_COMP_OPS_EXE_SSE_FP_PACKED events with FP_COMP_OPS_EXE_SSE_FP_SCALAR. + diff --git a/collectors/likwid/groups/nehalem/FLOPS_X87.txt b/collectors/likwid/groups/nehalem/FLOPS_X87.txt new file mode 100644 index 0000000..39cd8b4 --- /dev/null +++ b/collectors/likwid/groups/nehalem/FLOPS_X87.txt @@ -0,0 +1,21 @@ +SHORT X87 MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INST_RETIRED_X87 + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +X87 [MFLOP/s] 1.0E-06*PMC0/time + +LONG +Formulas: +X87 [MFLOP/s] = 1.0E-06*INST_RETIRED_X87/runtime +- +Profiling group to measure X87 FLOP rate. + diff --git a/collectors/likwid/groups/nehalem/ICACHE.txt b/collectors/likwid/groups/nehalem/ICACHE.txt new file mode 100644 index 0000000..49943ff --- /dev/null +++ b/collectors/likwid/groups/nehalem/ICACHE.txt @@ -0,0 +1,25 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1I_READS +PMC1 L1I_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 + +LONG +Formulas: +L1I request rate = L1I_READS / INSTR_RETIRED_ANY +L1I miss rate = L1I_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = L1I_MISSES / L1I_READS +- +This group measures some L1 instruction cache metrics.
diff --git a/collectors/likwid/groups/nehalem/L2.txt b/collectors/likwid/groups/nehalem/L2.txt new file mode 100644 index 0000000..e2715cc --- /dev/null +++ b/collectors/likwid/groups/nehalem/L2.txt @@ -0,0 +1,40 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPL +PMC1 L1D_M_EVICT +PMC2 L1I_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is +computed by the number of cache lines allocated in the L1 and the +number of modified cache lines evicted from the L1. Also reports on +total data volume transferred between L2 and L1 cache. +Note that this bandwidth also includes data transfers due to a +write allocate load on a store miss in L1 and traffic caused by misses in the +L1 instruction cache. + + diff --git a/collectors/likwid/groups/nehalem/L2CACHE.txt b/collectors/likwid/groups/nehalem/L2CACHE.txt new file mode 100644 index 0000000..343b263 --- /dev/null +++ b/collectors/likwid/groups/nehalem/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_RQSTS_REFERENCES +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_RQSTS_REFERENCES/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_RQSTS_REFERENCES +- +This group measures the locality of your data accesses with regard to the +L2 cache. The L2 request rate tells you how data intensive your code is, +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally, the L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse.
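All cache and memory bandwidths in these groups follow the same pattern: an event counting 64-byte cache line transfers is scaled to MBytes/s over the runtime. A hedged C sketch of the L2 bandwidth formula above (counter values invented for the example):

#include <stdio.h>

int main(void) {
    /* Hypothetical counts for the nehalem L2 group. */
    double l1d_repl    = 3.0e8; /* PMC0: lines loaded into L1D */
    double l1d_m_evict = 1.0e8; /* PMC1: modified lines evicted from L1D */
    double l1i_misses  = 1.0e6; /* PMC2: L1 instruction cache misses */
    double time        = 2.0;   /* measured runtime in seconds */
    const double line  = 64.0;  /* cache line size in bytes */

    double lines  = l1d_repl + l1d_m_evict + l1i_misses;
    double bw_mbs = 1.0e-6 * lines * line / time; /* MBytes/s */
    double vol_gb = 1.0e-9 * lines * line;        /* GBytes */

    printf("L2 bandwidth [MBytes/s]: %.1f\n", bw_mbs);
    printf("L2 data volume [GBytes]: %.2f\n", vol_gb);
    return 0;
}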
+ + diff --git a/collectors/likwid/groups/nehalem/L3.txt b/collectors/likwid/groups/nehalem/L3.txt new file mode 100644 index 0000000..70b5f29 --- /dev/null +++ b/collectors/likwid/groups/nehalem/L3.txt @@ -0,0 +1,36 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ANY +PMC1 L2_LINES_OUT_DEMAND_DIRTY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ANY*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ANY*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DEMAND_DIRTY*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the +number of cache lines allocated in the L2 and the number of modified cache lines +evicted from the L2. Also reports total data volume between L3 and L2 caches. +Note that this bandwidth also includes data transfers due to a write allocate +load on a store miss in L2. + diff --git a/collectors/likwid/groups/nehalem/L3CACHE.txt b/collectors/likwid/groups/nehalem/L3CACHE.txt new file mode 100644 index 0000000..15e00ed --- /dev/null +++ b/collectors/likwid/groups/nehalem/L3CACHE.txt @@ -0,0 +1,34 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +UPMC0 UNC_L3_HITS_ANY +UPMC1 UNC_L3_MISS_ANY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate (UPMC0+UPMC1)/FIXC0 +L3 miss rate UPMC1/FIXC0 +L3 miss ratio UPMC1/(UPMC0+UPMC1) + +LONG +Formulas: +L3 request rate = (UNC_L3_HITS_ANY+UNC_L3_MISS_ANY)/INSTR_RETIRED_ANY +L3 miss rate = UNC_L3_MISS_ANY/INSTR_RETIRED_ANY +L3 miss ratio = UNC_L3_MISS_ANY/(UNC_L3_HITS_ANY+UNC_L3_MISS_ANY) +- +This group measures the locality of your data accesses with regard to the +L3 cache. The L3 request rate tells you how data intensive your code is, +or how many data accesses you have on average per instruction. +The L3 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally, the L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+ + diff --git a/collectors/likwid/groups/nehalem/MEM.txt b/collectors/likwid/groups/nehalem/MEM.txt new file mode 100644 index 0000000..b528670 --- /dev/null +++ b/collectors/likwid/groups/nehalem/MEM.txt @@ -0,0 +1,49 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +UPMC0 UNC_QMC_NORMAL_READS_ANY +UPMC1 UNC_QMC_WRITES_FULL_ANY +UPMC2 UNC_QHL_REQUESTS_REMOTE_READS +UPMC3 UNC_QHL_REQUESTS_REMOTE_WRITES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*UPMC0*64.0/time +Memory read data volume [GBytes] 1.0E-09*UPMC0*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*UPMC1*64.0/time +Memory write data volume [GBytes] 1.0E-09*UPMC1*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64.0 +Remote memory read bandwidth [MBytes/s] 1.0E-06*UPMC2*64.0/time +Remote memory read data volume [GBytes] 1.0E-09*UPMC2*64.0 +Remote memory write bandwidth [MBytes/s] 1.0E-06*UPMC3*64.0/time +Remote memory write data volume [GBytes] 1.0E-09*UPMC3*64.0 +Remote memory bandwidth [MBytes/s] 1.0E-06*(UPMC2+UPMC3)*64.0/time +Remote memory data volume [GBytes] 1.0E-09*(UPMC2+UPMC3)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_NORMAL_READS_ANY*64.0/time +Memory read data volume [GBytes] = 1.0E-09*UNC_QMC_NORMAL_READS_ANY*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_WRITES_FULL_ANY*64.0/time +Memory write data volume [GBytes] = 1.0E-09*UNC_QMC_WRITES_FULL_ANY*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0/time +Memory data volume [GBytes] = 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0 +Remote memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_READS*64.0/time +Remote memory read data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_READS*64.0 +Remote memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0/time +Remote memory write data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0 +Remote memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0/time +Remote memory data volume [GBytes] = 1.0E-09*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +This group will be measured by one core per socket. The Remote Read BW tells +you if cache lines are transferred between sockets, meaning that cores access +data owned by a remote NUMA domain. 
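The MEM group applies the same 64-bytes-per-event scaling to the uncore memory controller counters; reads and writes are summed for the total bandwidth, and the QHL remote events are scaled identically. A minimal sketch with invented uncore counter values:

#include <stdio.h>

int main(void) {
    /* Hypothetical uncore counter readings for the nehalem MEM group. */
    double qmc_reads     = 5.0e8; /* UPMC0: UNC_QMC_NORMAL_READS_ANY */
    double qmc_writes    = 2.0e8; /* UPMC1: UNC_QMC_WRITES_FULL_ANY */
    double remote_reads  = 1.0e7; /* UPMC2: UNC_QHL_REQUESTS_REMOTE_READS */
    double remote_writes = 4.0e6; /* UPMC3: UNC_QHL_REQUESTS_REMOTE_WRITES */
    double time          = 2.0;   /* measured runtime in seconds */
    const double line    = 64.0;  /* bytes per cache line transfer */

    printf("Memory bandwidth [MBytes/s]: %.1f\n",
           1.0e-6 * (qmc_reads + qmc_writes) * line / time);
    printf("Remote memory bandwidth [MBytes/s]: %.1f\n",
           1.0e-6 * (remote_reads + remote_writes) * line / time);
    return 0;
}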
+ diff --git a/collectors/likwid/groups/nehalem/SCHEDULER.txt b/collectors/likwid/groups/nehalem/SCHEDULER.txt new file mode 100644 index 0000000..0e43cce --- /dev/null +++ b/collectors/likwid/groups/nehalem/SCHEDULER.txt @@ -0,0 +1,25 @@ +SHORT Scheduler Ports + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_PORT0 +PMC1 UOPS_EXECUTED_PORT1 +PMC2 UOPS_EXECUTED_PORT5 + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Ratio Port 1 PMC1/PMC0 +Ratio Port 5 PMC2/PMC0 + +LONG +Formulas: +Ratio Port 1 = UOPS_EXECUTED_PORT1/UOPS_EXECUTED_PORT0 +Ratio Port 5 = UOPS_EXECUTED_PORT5/UOPS_EXECUTED_PORT0 +- +Measures how many uops were executed on each of the issue ports. + diff --git a/collectors/likwid/groups/nehalem/TLB.txt b/collectors/likwid/groups/nehalem/TLB.txt new file mode 100644 index 0000000..c380851 --- /dev/null +++ b/collectors/likwid/groups/nehalem/TLB.txt @@ -0,0 +1,30 @@ +SHORT TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_MISSES_ANY +PMC1 L1D_ALL_REF_ANY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB request rate PMC1/FIXC0 +L1 DTLB miss rate PMC0/FIXC0 +L1 DTLB miss ratio PMC0/PMC1 + +LONG +Formulas: +L1 DTLB request rate = L1D_ALL_REF_ANY / INSTR_RETIRED_ANY +L1 DTLB miss rate = DTLB_MISSES_ANY / INSTR_RETIRED_ANY +L1 DTLB miss ratio = DTLB_MISSES_ANY / L1D_ALL_REF_ANY +- +The L1 DTLB request rate tells you how data intensive your code is, +or how many data accesses you have on average per instruction. +The DTLB miss rate gives a measure of how often a TLB miss occurred +per instruction. And finally, the L1 DTLB miss ratio tells you how many +of your memory references caused a TLB miss on average. + diff --git a/collectors/likwid/groups/nehalemEX/BRANCH.txt b/collectors/likwid/groups/nehalemEX/BRANCH.txt new file mode 100644 index 0000000..1ef9f11 --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often, on average, a branch or a mispredicted branch occurred +per retired instruction. The branch misprediction ratio directly relates the number of +mispredicted branches to all branch instructions, i.e. which fraction of all branch instructions was mispredicted. +Instructions per branch is 1/branch rate.
+ diff --git a/collectors/likwid/groups/nehalemEX/CACHE.txt b/collectors/likwid/groups/nehalemEX/CACHE.txt new file mode 100644 index 0000000..6603171 --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/CACHE.txt @@ -0,0 +1,36 @@ +SHORT Data cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPL +PMC1 L1D_ALL_REF_ANY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +data cache misses PMC0 +data cache request rate PMC1/FIXC0 +data cache miss rate PMC0/FIXC0 +data cache miss ratio PMC0/PMC1 + +LONG +Formulas: +data cache misses = L1D_REPL +data cache request rate = L1D_ALL_REF_ANY / INSTR_RETIRED_ANY +data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY +data cache miss ratio = L1D_REPL / L1D_ALL_REF_ANY +- +This group measures the locality of your data accesses with regard to the +L1 cache. The data cache request rate tells you how data intensive your code is, +or how many data accesses you have on average per instruction. +The data cache miss rate gives a measure of how often it was necessary to get +cache lines from higher levels of the memory hierarchy. And finally, the +data cache miss ratio tells you how many of your memory references required +a cache line to be loaded from a higher level. While the data cache miss rate +might be given by your algorithm, you should try to get the data cache miss ratio +as low as possible by increasing your cache reuse. + diff --git a/collectors/likwid/groups/nehalemEX/DATA.txt b/collectors/likwid/groups/nehalemEX/DATA.txt new file mode 100644 index 0000000..31bba51 --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_INST_RETIRED_LOADS +PMC1 MEM_INST_RETIRED_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_INST_RETIRED_LOADS/MEM_INST_RETIRED_STORES +- +This is a simple metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/nehalemEX/DIVIDE.txt b/collectors/likwid/groups/nehalemEX/DIVIDE.txt new file mode 100644 index 0000000..cb15563 --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0:EDGEDETECT ARITH_CYCLES_DIV_BUSY +PMC1 ARITH_CYCLES_DIV_BUSY + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0:EDGEDETECT +Avg. divide unit usage duration PMC1/PMC0:EDGEDETECT + +LONG +Formulas: +Number of divide ops = ARITH_CYCLES_DIV_BUSY:EDGEDETECT +Avg.
divide unit usage duration = ARITH_CYCLES_DIV_BUSY/ARITH_CYCLES_DIV_BUSY:EDGEDETECT +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/nehalemEX/FLOPS_DP.txt b/collectors/likwid/groups/nehalemEX/FLOPS_DP.txt new file mode 100644 index 0000000..0c2e56c --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/FLOPS_DP.txt @@ -0,0 +1,35 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR +PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION +PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +SP [MUOPS/s] 1.0E-06*PMC2/time +DP [MUOPS/s] 1.0E-06*PMC3/time + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime +Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime +SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime +DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime +- +The Nehalem cannot directly measure MFLOP/s if mixed-precision calculations are done. +Therefore both single and double precision counts are measured to ensure the correctness +of the measurements. You can check whether your code was vectorized by comparing the number of +FP_COMP_OPS_EXE_SSE_FP_PACKED events with FP_COMP_OPS_EXE_SSE_FP_SCALAR. + diff --git a/collectors/likwid/groups/nehalemEX/FLOPS_SP.txt b/collectors/likwid/groups/nehalemEX/FLOPS_SP.txt new file mode 100644 index 0000000..8046cbd --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/FLOPS_SP.txt @@ -0,0 +1,35 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR +PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION +PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +SP [MUOPS/s] 1.0E-06*PMC2/time +DP [MUOPS/s] 1.0E-06*PMC3/time + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime +Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime +SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime +DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime +- +The Nehalem cannot directly measure MFLOP/s if mixed-precision calculations are done. +Therefore both single and double precision counts are measured to ensure the correctness +of the measurements. You can check whether your code was vectorized by comparing the number of +FP_COMP_OPS_EXE_SSE_FP_PACKED events with FP_COMP_OPS_EXE_SSE_FP_SCALAR.
+ diff --git a/collectors/likwid/groups/nehalemEX/FLOPS_X87.txt b/collectors/likwid/groups/nehalemEX/FLOPS_X87.txt new file mode 100644 index 0000000..39cd8b4 --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/FLOPS_X87.txt @@ -0,0 +1,21 @@ +SHORT X87 MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INST_RETIRED_X87 + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +X87 [MFLOP/s] 1.0E-06*PMC0/time + +LONG +Formulas: +X87 [MFLOP/s] = 1.0E-06*INST_RETIRED_X87/runtime +- +Profiling group to measure X87 FLOP rate. + diff --git a/collectors/likwid/groups/nehalemEX/ICACHE.txt b/collectors/likwid/groups/nehalemEX/ICACHE.txt new file mode 100644 index 0000000..49943ff --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/ICACHE.txt @@ -0,0 +1,25 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1I_READS +PMC1 L1I_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 + +LONG +Formulas: +L1I request rate = L1I_READS / INSTR_RETIRED_ANY +L1I miss rate = L1I_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = L1I_MISSES / L1I_READS +- +This group measures some L1 instruction cache metrics. diff --git a/collectors/likwid/groups/nehalemEX/L2.txt b/collectors/likwid/groups/nehalemEX/L2.txt new file mode 100644 index 0000000..e2715cc --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/L2.txt @@ -0,0 +1,40 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPL +PMC1 L1D_M_EVICT +PMC2 L1I_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is +computed by the number of cache lines allocated in the L1 and the +number of modified cache lines evicted from the L1. Also reports on +total data volume transferred between L2 and L1 cache. +Note that this bandwidth also includes data transfers due to a +write allocate load on a store miss in L1 and traffic caused by misses in the +L1 instruction cache.
+ + diff --git a/collectors/likwid/groups/nehalemEX/L2CACHE.txt b/collectors/likwid/groups/nehalemEX/L2CACHE.txt new file mode 100644 index 0000000..343b263 --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_RQSTS_REFERENCES +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_RQSTS_REFERENCES/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_RQSTS_REFERENCES +- +This group measures the locality of your data accesses with regard to the +L2 cache. The L2 request rate tells you how data intensive your code is, +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally, the L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/nehalemEX/L3.txt b/collectors/likwid/groups/nehalemEX/L3.txt new file mode 100644 index 0000000..51a0811 --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/L3.txt @@ -0,0 +1,37 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ANY +PMC1 L2_LINES_OUT_DEMAND_DIRTY +PMC2 L2_LINES_OUT_PREFETCH_DIRTY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*(PMC1+PMC2)*64.0/time +L3 evict data volume [GBytes] 1.0E-09*(PMC1+PMC2)*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ANY*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ANY*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_OUT_DEMAND_DIRTY+L2_LINES_OUT_PREFETCH_DIRTY)*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*(L2_LINES_OUT_DEMAND_DIRTY+L2_LINES_OUT_PREFETCH_DIRTY)*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY+L2_LINES_OUT_PREFETCH_DIRTY)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY+L2_LINES_OUT_PREFETCH_DIRTY)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the +number of cache lines allocated in the L2 and the number of modified cache lines +evicted from the L2. Also reports total data volume between L3 and L2 caches. +Note that this bandwidth also includes data transfers due to a write allocate +load on a store miss in L2.
+ diff --git a/collectors/likwid/groups/nehalemEX/L3CACHE.txt b/collectors/likwid/groups/nehalemEX/L3CACHE.txt new file mode 100644 index 0000000..c6b204e --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/L3CACHE.txt @@ -0,0 +1,48 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +CBOX0C0 LLC_HITS_ALL +CBOX0C1 LLC_MISSES_ALL +CBOX1C0 LLC_HITS_ALL +CBOX1C1 LLC_MISSES_ALL +CBOX2C0 LLC_HITS_ALL +CBOX2C1 LLC_MISSES_ALL +CBOX3C0 LLC_HITS_ALL +CBOX3C1 LLC_MISSES_ALL +CBOX4C0 LLC_HITS_ALL +CBOX4C1 LLC_MISSES_ALL +CBOX5C0 LLC_HITS_ALL +CBOX5C1 LLC_MISSES_ALL +CBOX6C0 LLC_HITS_ALL +CBOX6C1 LLC_MISSES_ALL +CBOX7C0 LLC_HITS_ALL +CBOX7C1 LLC_MISSES_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate (CBOX0C0+CBOX0C1+CBOX1C0+CBOX1C1+CBOX2C0+CBOX2C1+CBOX3C0+CBOX3C1+CBOX4C0+CBOX4C1+CBOX5C0+CBOX5C1+CBOX6C0+CBOX6C1+CBOX7C0+CBOX7C1)/FIXC0 +L3 miss rate (CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)/FIXC0 +L3 miss ratio (CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)/(CBOX0C0+CBOX0C1+CBOX1C0+CBOX1C1+CBOX2C0+CBOX2C1+CBOX3C0+CBOX3C1+CBOX4C0+CBOX4C1+CBOX5C0+CBOX5C1+CBOX6C0+CBOX6C1+CBOX7C0+CBOX7C1) + +LONG +Formulas: +L3 request rate = (SUM(LLC_HITS_ALL)+SUM(LLC_MISSES_ALL))/INSTR_RETIRED_ANY +L3 miss rate = SUM(LLC_MISSES_ALL)/INSTR_RETIRED_ANY +L3 miss ratio = SUM(LLC_MISSES_ALL)/(SUM(LLC_HITS_ALL)+SUM(LLC_MISSES_ALL)) +- +This group measures the locality of your data accesses with regard to the +L3 cache. The L3 request rate tells you how data intensive your code is, +or how many data accesses you have on average per instruction. +The L3 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally, the L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+ + diff --git a/collectors/likwid/groups/nehalemEX/MEM.txt b/collectors/likwid/groups/nehalemEX/MEM.txt new file mode 100644 index 0000000..d3d2522 --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/MEM.txt @@ -0,0 +1,43 @@ +SHORT Main memory bandwidth + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +WBOXFIX UNCORE_CLOCK +MBOX0C0 FVC_EV0_BBOX_CMDS_READS +MBOX0C1 DRAM_CMD_CAS_WR_OPN +MBOX1C0 FVC_EV0_BBOX_CMDS_READS +MBOX1C1 DRAM_CMD_CAS_WR_OPN + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*(WBOXFIX)/time +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64 + +LONG +Formulas: +Uncore Clock [MHz] = 1.E-06*(UNCORE_CLOCK)/time +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(FVC_EV0_BBOX_CMDS_READS))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(FVC_EV0_BBOX_CMDS_READS))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(DRAM_CMD_CAS_WR_OPN))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(DRAM_CMD_CAS_WR_OPN))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(FVC_EV0_BBOX_CMDS_READS)+SUM(DRAM_CMD_CAS_WR_OPN))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(FVC_EV0_BBOX_CMDS_READS)+SUM(DRAM_CMD_CAS_WR_OPN))*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +On Nehalem EX it is not possible to measure the write operations with the +FVC_EV0_BBOX_CMDS_WRITES event at the same time as the FVC_EV0_BBOX_CMDS_READS +because they set contrary bits. The DRAM_CMD_CAS_WR_OPN is an alternative but +it only measures write operations to open pages, hence writes to closed pages +are not included here. + diff --git a/collectors/likwid/groups/nehalemEX/SCHEDULER.txt b/collectors/likwid/groups/nehalemEX/SCHEDULER.txt new file mode 100644 index 0000000..0e43cce --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/SCHEDULER.txt @@ -0,0 +1,25 @@ +SHORT Scheduler Ports + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_PORT0 +PMC1 UOPS_EXECUTED_PORT1 +PMC2 UOPS_EXECUTED_PORT5 + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Ratio Port 1 PMC1/PMC0 +Ratio Port 5 PMC2/PMC0 + +LONG +Formulas: +Ratio Port 1 = UOPS_EXECUTED_PORT1/UOPS_EXECUTED_PORT0 +Ratio Port 5 = UOPS_EXECUTED_PORT5/UOPS_EXECUTED_PORT0 +- +Measures how many uops were executed on each of the issue ports.
+ diff --git a/collectors/likwid/groups/nehalemEX/TLB.txt b/collectors/likwid/groups/nehalemEX/TLB.txt new file mode 100644 index 0000000..0e358b8 --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/TLB.txt @@ -0,0 +1,30 @@ +SHORT TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_MISSES_ANY +PMC1 L1D_ALL_REF_ANY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB request rate PMC1/FIXC0 +L1 DTLB miss rate PMC0/FIXC0 +L1 DTLB miss ratio PMC0/PMC1 + +LONG +Formulas: +L1 DTLB request rate = L1D_ALL_REF_ANY / INSTR_RETIRED_ANY +L1 DTLB miss rate = DTLB_MISSES_ANY / INSTR_RETIRED_ANY +L1 DTLB miss ratio = DTLB_MISSES_ANY / L1D_ALL_REF_ANY +- +The L1 DTLB request rate tells you how data intensive your code is, +or how many data accesses you have on average per instruction. +The DTLB miss rate gives a measure of how often a TLB miss occurred +per instruction. And finally, the L1 DTLB miss ratio tells you how many +of your memory references caused a TLB miss on average. + diff --git a/collectors/likwid/groups/nvidia_gpu_cc_ge_7/DATA.txt b/collectors/likwid/groups/nvidia_gpu_cc_ge_7/DATA.txt new file mode 100644 index 0000000..4171640 --- /dev/null +++ b/collectors/likwid/groups/nvidia_gpu_cc_ge_7/DATA.txt @@ -0,0 +1,16 @@ +SHORT Load to store ratio + +EVENTSET +GPU0 SMSP_SASS_INST_EXECUTED_OP_GLOBAL_LD_SUM +GPU1 SMSP_SASS_INST_EXECUTED_OP_GLOBAL_ST_SUM + +METRICS +Runtime (RDTSC) [s] time +Load to store ratio GPU0/GPU1 + +LONG +Formulas: +Load to store ratio = SMSP_SASS_INST_EXECUTED_OP_GLOBAL_LD_SUM/SMSP_SASS_INST_EXECUTED_OP_GLOBAL_ST_SUM +- +This is a metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/nvidia_gpu_cc_ge_7/FLOPS_DP.txt b/collectors/likwid/groups/nvidia_gpu_cc_ge_7/FLOPS_DP.txt new file mode 100644 index 0000000..7c6ae6c --- /dev/null +++ b/collectors/likwid/groups/nvidia_gpu_cc_ge_7/FLOPS_DP.txt @@ -0,0 +1,19 @@ +SHORT Double-precision floating point + +EVENTSET +GPU0 SMSP_SASS_THREAD_INST_EXECUTED_OP_DADD_PRED_ON_SUM +GPU1 SMSP_SASS_THREAD_INST_EXECUTED_OP_DMUL_PRED_ON_SUM +GPU2 SMSP_SASS_THREAD_INST_EXECUTED_OP_DFMA_PRED_ON_SUM + + +METRICS +Runtime (RDTSC) [s] time +DP [MFLOP/s] 1E-6*(GPU0+GPU1+(GPU2*2))/time + + +LONG +Formulas: +DP [MFLOP/s] = 1E-6*(SMSP_SASS_THREAD_INST_EXECUTED_OP_DADD_PRED_ON_SUM+SMSP_SASS_THREAD_INST_EXECUTED_OP_DMUL_PRED_ON_SUM+2*SMSP_SASS_THREAD_INST_EXECUTED_OP_DFMA_PRED_ON_SUM)/time +-- +This group measures the double-precision floating-point operations per second using the events +SMSP_SASS_THREAD_INST_EXECUTED_OP_D{ADD, MUL, FMA}_PRED_ON_SUM.
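In the GPU FLOPS groups an FMA instruction contributes two floating-point operations, which is why the FMA event is weighted by 2 in the formulas. A short C sketch of the derivation, with invented SASS instruction counts:

#include <stdio.h>

int main(void) {
    /* Hypothetical counts for the FLOPS_DP group (cc >= 7). */
    double dadd = 1.0e9; /* GPU0: ..._OP_DADD_PRED_ON_SUM */
    double dmul = 5.0e8; /* GPU1: ..._OP_DMUL_PRED_ON_SUM */
    double dfma = 2.0e9; /* GPU2: ..._OP_DFMA_PRED_ON_SUM, 2 FLOPs each */
    double time = 0.5;   /* measured runtime in seconds */

    double dp_mflops = 1.0e-6 * (dadd + dmul + 2.0 * dfma) / time;
    printf("DP [MFLOP/s]: %.1f\n", dp_mflops);
    return 0;
}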
diff --git a/collectors/likwid/groups/nvidia_gpu_cc_ge_7/FLOPS_HP.txt b/collectors/likwid/groups/nvidia_gpu_cc_ge_7/FLOPS_HP.txt new file mode 100644 index 0000000..76c78cd --- /dev/null +++ b/collectors/likwid/groups/nvidia_gpu_cc_ge_7/FLOPS_HP.txt @@ -0,0 +1,19 @@ +SHORT Half-precision floating point + +EVENTSET +GPU0 SMSP_SASS_THREAD_INST_EXECUTED_OP_HADD_PRED_ON_SUM +GPU1 SMSP_SASS_THREAD_INST_EXECUTED_OP_HMUL_PRED_ON_SUM +GPU2 SMSP_SASS_THREAD_INST_EXECUTED_OP_HFMA_PRED_ON_SUM + + +METRICS +Runtime (RDTSC) [s] time +HP [MFLOP/s] 1E-6*(GPU0+GPU1+(GPU2*2))/time + + +LONG +Formulas: +HP [MFLOP/s] = 1E-6*(SMSP_SASS_THREAD_INST_EXECUTED_OP_HADD_PRED_ON_SUM+SMSP_SASS_THREAD_INST_EXECUTED_OP_HMUL_PRED_ON_SUM+2*SMSP_SASS_THREAD_INST_EXECUTED_OP_HFMA_PRED_ON_SUM)/time +-- +This group measures the half-precision floating-point operations per second using the events +SMSP_SASS_THREAD_INST_EXECUTED_OP_H{ADD, MUL, FMA}_PRED_ON_SUM. diff --git a/collectors/likwid/groups/nvidia_gpu_cc_ge_7/FLOPS_SP.txt b/collectors/likwid/groups/nvidia_gpu_cc_ge_7/FLOPS_SP.txt new file mode 100644 index 0000000..cdc7a4e --- /dev/null +++ b/collectors/likwid/groups/nvidia_gpu_cc_ge_7/FLOPS_SP.txt @@ -0,0 +1,19 @@ +SHORT Single-precision floating point + +EVENTSET +GPU0 SMSP_SASS_THREAD_INST_EXECUTED_OP_FADD_PRED_ON_SUM +GPU1 SMSP_SASS_THREAD_INST_EXECUTED_OP_FMUL_PRED_ON_SUM +GPU2 SMSP_SASS_THREAD_INST_EXECUTED_OP_FFMA_PRED_ON_SUM + + +METRICS +Runtime (RDTSC) [s] time +SP [MFLOP/s] 1E-6*(GPU0+GPU1+(GPU2*2))/time + + +LONG +Formulas: +SP [MFLOP/s] = 1E-6*(SMSP_SASS_THREAD_INST_EXECUTED_OP_FADD_PRED_ON_SUM+SMSP_SASS_THREAD_INST_EXECUTED_OP_FMUL_PRED_ON_SUM+2*SMSP_SASS_THREAD_INST_EXECUTED_OP_FFMA_PRED_ON_SUM)/time +-- +This group measures the single-precision floating-point operations per second using the events +SMSP_SASS_THREAD_INST_EXECUTED_OP_F{ADD, MUL, FMA}_PRED_ON_SUM. diff --git a/collectors/likwid/groups/nvidia_gpu_cc_lt_7/DATA.txt b/collectors/likwid/groups/nvidia_gpu_cc_lt_7/DATA.txt new file mode 100644 index 0000000..b96bf08 --- /dev/null +++ b/collectors/likwid/groups/nvidia_gpu_cc_lt_7/DATA.txt @@ -0,0 +1,20 @@ +SHORT Load to store ratio + +EVENTSET +GPU0 GLOBAL_LOAD +GPU1 GLOBAL_STORE +GPU2 INST_EXECUTED +GPU3 ACTIVE_CYCLES + +METRICS +Runtime (RDTSC) [s] time +CPI GPU3/GPU2 +Load to store ratio GPU0/GPU1 + +LONG +Formulas: +CPI = ACTIVE_CYCLES/INST_EXECUTED +Load to store ratio = GLOBAL_LOAD/GLOBAL_STORE +- +This is a metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/nvidia_gpu_cc_lt_7/FLOPS_DP.txt b/collectors/likwid/groups/nvidia_gpu_cc_lt_7/FLOPS_DP.txt new file mode 100644 index 0000000..c03ac90 --- /dev/null +++ b/collectors/likwid/groups/nvidia_gpu_cc_lt_7/FLOPS_DP.txt @@ -0,0 +1,19 @@ +SHORT Double-precision floating point + +EVENTSET +GPU0 INST_EXECUTED_FP64_PIPE_S0 +GPU1 INST_EXECUTED_FP64_PIPE_S1 +GPU2 INST_EXECUTED_FP64_PIPE_S2 +GPU3 INST_EXECUTED_FP64_PIPE_S3 + + +METRICS +DP [MFLOP/s] 1E-6*(GPU0+GPU1+GPU2+GPU3)/time + + +LONG +Formulas: +DP [MFLOP/s] = 1E-6*(SUM(INST_EXECUTED_FP64_PIPE_S*))/time +-- +This group measures the double precision floating-point operations per second using the events +INST_EXECUTED_FP64_PIPE_S*.
diff --git a/collectors/likwid/groups/nvidia_gpu_cc_lt_7/FLOPS_SP.txt b/collectors/likwid/groups/nvidia_gpu_cc_lt_7/FLOPS_SP.txt new file mode 100644 index 0000000..09f58ee --- /dev/null +++ b/collectors/likwid/groups/nvidia_gpu_cc_lt_7/FLOPS_SP.txt @@ -0,0 +1,20 @@ +SHORT Single-precision floating point + +EVENTSET +GPU0 INST_EXECUTED_FP64_PIPE_S0 +GPU1 INST_EXECUTED_FP64_PIPE_S1 +GPU2 INST_EXECUTED_FP64_PIPE_S2 +GPU3 INST_EXECUTED_FP64_PIPE_S3 + + +METRICS +SP [MFLOP/s] 1E-6*(GPU0+GPU1+GPU2+GPU3)/time + + +LONG +Formulas: +SP [MFLOP/s] = 1E-6*(SUM(INST_EXECUTED_FP64_PIPE_S*))/time +-- +This group measures the single precision floating-point operations per second using the events +INST_EXECUTED_FP64_PIPE_S*. Unfortunately, not all GPUs provide these events, although they provide +a metric for SP FP operations, which is currently not usable with LIKWID. diff --git a/collectors/likwid/groups/pentiumm/BRANCH.txt b/collectors/likwid/groups/pentiumm/BRANCH.txt new file mode 100644 index 0000000..269a500 --- /dev/null +++ b/collectors/likwid/groups/pentiumm/BRANCH.txt @@ -0,0 +1,17 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +PMC0 BR_INST_EXEC +PMC1 BR_MISSP_EXEC + +METRICS +Runtime (RDTSC) [s] time +Branch misprediction ratio PMC1/PMC0 + +LONG +Formulas: +Branch misprediction ratio = BR_MISSP_EXEC / BR_INST_EXEC +- +The branch misprediction ratio directly relates the number of mispredicted branches +to all branch instructions, i.e. which fraction of all branch instructions was mispredicted. diff --git a/collectors/likwid/groups/pentiumm/CPI.txt b/collectors/likwid/groups/pentiumm/CPI.txt new file mode 100644 index 0000000..ae4aa26 --- /dev/null +++ b/collectors/likwid/groups/pentiumm/CPI.txt @@ -0,0 +1,22 @@ +SHORT Cycles per instruction + +EVENTSET +PMC0 UOPS_RETIRED +PMC1 CPU_CLK_UNHALTED + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +IPC PMC0/PMC1 + +LONG +Formulas: +CPI = CPU_CLK_UNHALTED/UOPS_RETIRED +IPC = UOPS_RETIRED/CPU_CLK_UNHALTED +- +This group measures how efficiently the processor works with +regard to instruction throughput. Also important as a standalone +metric is UOPS_RETIRED as it tells you how many uops +you need to execute for a task. An optimization might show very +low CPI values but execute many more instructions overall. + diff --git a/collectors/likwid/groups/pentiumm/FLOPS_DP.txt b/collectors/likwid/groups/pentiumm/FLOPS_DP.txt new file mode 100644 index 0000000..058a64e --- /dev/null +++ b/collectors/likwid/groups/pentiumm/FLOPS_DP.txt @@ -0,0 +1,20 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +PMC0 EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP +PMC1 EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_DP + +METRICS +Runtime (RDTSC) [s] time +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*(PMC0)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP*2+EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_DP)/runtime +Packed [MUOPS/s] = 1.0E-06*(EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP)/time +Scalar [MUOPS/s] = 1.0E-06*EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_DP/time +- +SSE scalar and packed double precision FLOP rates.
+ diff --git a/collectors/likwid/groups/pentiumm/FLOPS_SP.txt b/collectors/likwid/groups/pentiumm/FLOPS_SP.txt new file mode 100644 index 0000000..d70b835 --- /dev/null +++ b/collectors/likwid/groups/pentiumm/FLOPS_SP.txt @@ -0,0 +1,18 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +PMC0 EMON_SSE_SSE2_COMP_INST_RETIRED_ALL_SP +PMC1 EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_SP + +METRICS +Runtime (RDTSC) [s] time +SP [MFLOP/s] 1.0E-06*(PMC0)/time +Scalar [MUOPS/s] 1.0E-06*(PMC1)/time + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(EMON_SSE_SSE2_COMP_INST_RETIRED_ALL_SP)/runtime +Scalar [MUOPS/s] = 1.0E-06*(EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_SP)/runtime +- +SSE scalar and packed single precision FLOP rates. + diff --git a/collectors/likwid/groups/pentiumm/L3.txt b/collectors/likwid/groups/pentiumm/L3.txt new file mode 100644 index 0000000..2ed5293 --- /dev/null +++ b/collectors/likwid/groups/pentiumm/L3.txt @@ -0,0 +1,30 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +PMC0 L2_LINES_IN_ALL_ALL +PMC1 L2_LINES_OUT_ALL_ALL + +METRICS +Runtime (RDTSC) [s] time +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL_ALL*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL_ALL*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_ALL_ALL*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_ALL_ALL*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL_ALL+L2_LINES_OUT_ALL_ALL)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL_ALL+L2_LINES_OUT_ALL_ALL)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the +number of cache lines allocated in the L2 and the number of modified cache lines +evicted from the L2. The group also outputs the total data volume transferred between +L2 and L3. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L2. + diff --git a/collectors/likwid/groups/phi/CACHE.txt b/collectors/likwid/groups/phi/CACHE.txt new file mode 100644 index 0000000..01ac5e4 --- /dev/null +++ b/collectors/likwid/groups/phi/CACHE.txt @@ -0,0 +1,22 @@ +SHORT L1 compute to data access ratio + +EVENTSET +PMC0 VPU_ELEMENTS_ACTIVE +PMC1 DATA_READ_OR_WRITE + +METRICS +Runtime (RDTSC) [s] time +L1 compute intensity PMC0/PMC1 + +LONG +Formulas: +L1 compute intensity = VPU_ELEMENTS_ACTIVE/DATA_READ_OR_WRITE +- +This metric is a way to measure the computational density of an +application, or how many computations it is performing on average for each +piece of data loaded. The L1 compute to data access ratio should be +used to judge the suitability of an application for running on the Intel MIC +architecture. Applications that will perform well on the Intel MIC +architecture should be vectorized, and ideally be able to perform multiple +operations on the same pieces of data (or same cache lines).
+ diff --git a/collectors/likwid/groups/phi/COMPUTE_TO_DATA_RATIO.txt b/collectors/likwid/groups/phi/COMPUTE_TO_DATA_RATIO.txt new file mode 100644 index 0000000..6fdd008 --- /dev/null +++ b/collectors/likwid/groups/phi/COMPUTE_TO_DATA_RATIO.txt @@ -0,0 +1,22 @@ +SHORT L2 compute to data access ratio + +EVENTSET +PMC0 VPU_ELEMENTS_ACTIVE +PMC1 DATA_READ_MISS_OR_WRITE_MISS + +METRICS +Runtime (RDTSC) [s] time +L2 compute intensity PMC0/PMC1 + +LONG +Formulas: +L2 compute intensity = VPU_ELEMENTS_ACTIVE/DATA_READ_MISS_OR_WRITE_MISS +- +This metric is a way to measure the computational density of an +application, or how many computations it is performing on average for each +piece of data loaded. The L2 compute to data access ratio should be +used to judge the suitability of an application for running on the Intel MIC +architecture. Applications that will perform well on the Intel MIC +architecture should be vectorized, and ideally be able to perform multiple +operations on the same pieces of data (or same cache lines). + diff --git a/collectors/likwid/groups/phi/CPI.txt b/collectors/likwid/groups/phi/CPI.txt new file mode 100644 index 0000000..f3d8b4e --- /dev/null +++ b/collectors/likwid/groups/phi/CPI.txt @@ -0,0 +1,23 @@ +SHORT Cycles per instruction + +EVENTSET +PMC0 INSTRUCTIONS_EXECUTED +PMC1 CPU_CLK_UNHALTED + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC1*inverseClock +CPI PMC1/PMC0 +IPC PMC0/PMC1 + +LONG +Formulas: +CPI = CPU_CLK_UNHALTED/INSTRUCTIONS_EXECUTED +IPC = INSTRUCTIONS_EXECUTED/CPU_CLK_UNHALTED +- +This group measures how efficiently the processor works with +regard to instruction throughput. Also important as a standalone +metric is INSTRUCTIONS_EXECUTED as it tells you how many instructions +you need to execute for a task. An optimization might show very +low CPI values but execute many more instructions overall. + diff --git a/collectors/likwid/groups/phi/MEM.txt b/collectors/likwid/groups/phi/MEM.txt new file mode 100644 index 0000000..8899592 --- /dev/null +++ b/collectors/likwid/groups/phi/MEM.txt @@ -0,0 +1,18 @@ +SHORT Memory bandwidth + +EVENTSET +PMC0 DATA_READ_MISS_OR_WRITE_MISS +PMC1 DATA_CACHE_LINES_WRITTEN_BACK + + +METRICS +Runtime (RDTSC) [s] time +Memory bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +Memory bandwidth [MBytes/s] = 1.0E-06*(DATA_READ_MISS_OR_WRITE_MISS+DATA_CACHE_LINES_WRITTEN_BACK)*64.0/time +Memory data volume [GBytes] = 1.0E-09*(DATA_READ_MISS_OR_WRITE_MISS+DATA_CACHE_LINES_WRITTEN_BACK)*64.0 +- +Total memory bandwidth and data volume. diff --git a/collectors/likwid/groups/phi/MEM1.txt b/collectors/likwid/groups/phi/MEM1.txt new file mode 100644 index 0000000..c9f7fb6 --- /dev/null +++ b/collectors/likwid/groups/phi/MEM1.txt @@ -0,0 +1,18 @@ +SHORT L2 write misses + +EVENTSET +PMC0 L2_DATA_WRITE_MISS_MEM_FILL + +METRICS +Runtime (RDTSC) [s] time +L2 RFO bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2 RFO data volume [GBytes] 1.0E-09*PMC0*64.0 + +LONG +Formulas: +L2 RFO bandwidth [MBytes/s] = 1.0E-06*L2_DATA_WRITE_MISS_MEM_FILL*64.0/time +L2 RFO data volume [GBytes] = 1.0E-09*L2_DATA_WRITE_MISS_MEM_FILL*64.0 +- +Bandwidth and data volume fetched from memory due to an L2 data write miss. These +fetches are commonly called write-allocate loads or read-for-ownership (RFO).
+ diff --git a/collectors/likwid/groups/phi/MEM2.txt b/collectors/likwid/groups/phi/MEM2.txt new file mode 100644 index 0000000..d44a823 --- /dev/null +++ b/collectors/likwid/groups/phi/MEM2.txt @@ -0,0 +1,17 @@ +SHORT L2 read misses + +EVENTSET +PMC0 L2_DATA_READ_MISS_MEM_FILL + +METRICS +Runtime (RDTSC) [s] time +L2 read bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2 read data volume [GBytes] 1.0E-09*PMC0*64.0 + +LONG +Formulas: +L2 read bandwidth [MBytes/s] = 1.0E-06*L2_DATA_READ_MISS_MEM_FILL*64.0/time +L2 read data volume [GBytes] = 1.0E-09*L2_DATA_READ_MISS_MEM_FILL*64.0 +- +The data volume and bandwidth caused by read misses in the L2 cache. + diff --git a/collectors/likwid/groups/phi/MEM3.txt b/collectors/likwid/groups/phi/MEM3.txt new file mode 100644 index 0000000..73de570 --- /dev/null +++ b/collectors/likwid/groups/phi/MEM3.txt @@ -0,0 +1,17 @@ +SHORT HW prefetch transfers + +EVENTSET +PMC0 HWP_L2MISS + +METRICS +Runtime (RDTSC) [s] time +Prefetch bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +Prefetch data volume [GBytes] 1.0E-09*PMC0*64.0 + +LONG +Formulas: +Prefetch bandwidth [MBytes/s] = 1.0E-06*HWP_L2MISS*64.0/time +Prefetch data volume [GBytes] = 1.0E-09*HWP_L2MISS*64.0 +- +The bandwidth and data volume caused by L2 misses from the hardware prefetcher. + diff --git a/collectors/likwid/groups/phi/MEM4.txt b/collectors/likwid/groups/phi/MEM4.txt new file mode 100644 index 0000000..9e892bd --- /dev/null +++ b/collectors/likwid/groups/phi/MEM4.txt @@ -0,0 +1,17 @@ +SHORT L2 victim requests + +EVENTSET +PMC0 L2_VICTIM_REQ_WITH_DATA + +METRICS +Runtime (RDTSC) [s] time +Victim bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +Victim data volume [GBytes] 1.0E-09*PMC0*64.0 + +LONG +Formulas: +Victim bandwidth [MBytes/s] = 1.0E-06*L2_VICTIM_REQ_WITH_DATA*64.0/time +Victim data volume [GBytes] = 1.0E-09*L2_VICTIM_REQ_WITH_DATA*64.0 +- +Data volume and bandwidth caused by cache line victims. + diff --git a/collectors/likwid/groups/phi/MEM5.txt b/collectors/likwid/groups/phi/MEM5.txt new file mode 100644 index 0000000..49acb98 --- /dev/null +++ b/collectors/likwid/groups/phi/MEM5.txt @@ -0,0 +1,19 @@ +SHORT L2 snoop hits + +EVENTSET +PMC0 SNP_HITM_L2 + +METRICS +Runtime (RDTSC) [s] time +Snoop bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +Snoop data volume [GBytes] 1.0E-09*PMC0*64.0 + +LONG +Formulas: +Snoop bandwidth [MBytes/s] = 1.0E-06*SNP_HITM_L2*64.0/time +Snoop data volume [GBytes] = 1.0E-09*SNP_HITM_L2*64.0 +- +Snoop traffic caused by HITM requests. HITM requests are L2 requests that +are served by another core's L2 cache but the remote cache line is in the modified +state. + diff --git a/collectors/likwid/groups/phi/MEM6.txt b/collectors/likwid/groups/phi/MEM6.txt new file mode 100644 index 0000000..835faf8 --- /dev/null +++ b/collectors/likwid/groups/phi/MEM6.txt @@ -0,0 +1,17 @@ +SHORT L2 read misses + +EVENTSET +PMC0 L2_READ_MISS + +METRICS +Runtime (RDTSC) [s] time +L2 read bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2 read data volume [GBytes] 1.0E-09*PMC0*64.0 + +LONG +Formulas: +L2 read bandwidth [MBytes/s] = 1.0E-06*L2_READ_MISS*64.0/time +L2 read data volume [GBytes] = 1.0E-09*L2_READ_MISS*64.0 +- +Data volume and bandwidth caused by read misses in the L2 cache.
+ diff --git a/collectors/likwid/groups/phi/MEM_READ.txt b/collectors/likwid/groups/phi/MEM_READ.txt new file mode 100644 index 0000000..fb107b0 --- /dev/null +++ b/collectors/likwid/groups/phi/MEM_READ.txt @@ -0,0 +1,20 @@ +SHORT Memory read bandwidth + +EVENTSET +PMC0 DATA_READ_MISS +PMC1 HWP_L2MISS + + +METRICS +Runtime (RDTSC) [s] time +Memory read bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(DATA_READ_MISS+HWP_L2MISS)*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(DATA_READ_MISS+HWP_L2MISS)*64.0 +- +Bandwidth and data volume of read operations from memory to the L2 cache. The +metric is introduced in the book 'Intel Xeon Phi Coprocessor High-Performance +Programming' by James Jeffers and James Reinders. diff --git a/collectors/likwid/groups/phi/MEM_WRITE.txt b/collectors/likwid/groups/phi/MEM_WRITE.txt new file mode 100644 index 0000000..01043fd --- /dev/null +++ b/collectors/likwid/groups/phi/MEM_WRITE.txt @@ -0,0 +1,20 @@ +SHORT Memory write bandwidth + +EVENTSET +PMC0 L2_VICTIM_REQ_WITH_DATA +PMC1 SNP_HITM_L2 + + +METRICS +Runtime (RDTSC) [s] time +Memory write bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +Memory write bandwidth [MBytes/s] = 1.0E-06*(L2_VICTIM_REQ_WITH_DATA+SNP_HITM_L2)*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(L2_VICTIM_REQ_WITH_DATA+SNP_HITM_L2)*64.0 +- +Bandwidth and data volume of write operations from the L2 cache to memory. The +metric is introduced in the book 'Intel Xeon Phi Coprocessor High-Performance +Programming' by James Jeffers and James Reinders. diff --git a/collectors/likwid/groups/phi/PAIRING.txt b/collectors/likwid/groups/phi/PAIRING.txt new file mode 100644 index 0000000..ce3627c --- /dev/null +++ b/collectors/likwid/groups/phi/PAIRING.txt @@ -0,0 +1,21 @@ +SHORT Pairing ratio + +EVENTSET +PMC0 INSTRUCTIONS_EXECUTED +PMC1 INSTRUCTIONS_EXECUTED_V_PIPE + +METRICS +Runtime (RDTSC) [s] time +V-pipe ratio PMC1/PMC0 +Pairing ratio PMC1/(PMC0-PMC1) + +LONG +Formulas: +V-pipe ratio = INSTRUCTIONS_EXECUTED_V_PIPE/INSTRUCTIONS_EXECUTED +Pairing ratio = INSTRUCTIONS_EXECUTED_V_PIPE/(INSTRUCTIONS_EXECUTED-INSTRUCTIONS_EXECUTED_V_PIPE) +- +Each hardware thread on the Xeon Phi can execute two instructions simultaneously, +one in the U-pipe and one in the V-pipe. But this is only possible if the +instructions can be paired. The instructions executed in a paired fashion are counted +by the event INSTRUCTIONS_EXECUTED_V_PIPE. The event INSTRUCTIONS_EXECUTED increments +for each instruction, hence the maximal increase per cycle is 2. diff --git a/collectors/likwid/groups/phi/READ_MISS_RATIO.txt b/collectors/likwid/groups/phi/READ_MISS_RATIO.txt new file mode 100644 index 0000000..dbdaad5 --- /dev/null +++ b/collectors/likwid/groups/phi/READ_MISS_RATIO.txt @@ -0,0 +1,15 @@ +SHORT Miss ratio for data reads + +EVENTSET +PMC0 DATA_READ +PMC1 DATA_READ_MISS + +METRICS +Runtime (RDTSC) [s] time +Read miss ratio PMC1/PMC0 + +LONG +Formulas: +Read miss ratio = DATA_READ_MISS/DATA_READ +-- +Miss ratio for data reads.
diff --git a/collectors/likwid/groups/phi/TLB.txt b/collectors/likwid/groups/phi/TLB.txt
new file mode 100644
index 0000000..6f00359
--- /dev/null
+++ b/collectors/likwid/groups/phi/TLB.txt
@@ -0,0 +1,23 @@
+SHORT TLB Misses
+
+EVENTSET
+PMC0 LONG_DATA_PAGE_WALK
+PMC1 DATA_PAGE_WALK
+
+METRICS
+Runtime (RDTSC) [s] time
+L1 TLB misses [misses/s] PMC1/time
+L2 TLB misses [misses/s] PMC0/time
+L1 TLB misses per L2 TLB miss PMC1/PMC0
+
+LONG
+Formulas:
+L1 TLB misses [misses/s] = DATA_PAGE_WALK/time
+L2 TLB misses [misses/s] = LONG_DATA_PAGE_WALK/time
+L1 TLB misses per L2 TLB miss = DATA_PAGE_WALK/LONG_DATA_PAGE_WALK
+-
+Analysis of the layered TLB of the Intel Xeon Phi. According to the book
+'Intel Xeon Phi Coprocessor High-Performance Programming' by James Jeffers and
+James Reinders, a high L1 TLB misses per L2 TLB miss ratio suggests that your
+working set fits into the L2 TLB but not into the L1 TLB. Using large pages may
+be beneficial.
diff --git a/collectors/likwid/groups/phi/TLB_L1.txt b/collectors/likwid/groups/phi/TLB_L1.txt
new file mode 100644
index 0000000..d826d04
--- /dev/null
+++ b/collectors/likwid/groups/phi/TLB_L1.txt
@@ -0,0 +1,23 @@
+SHORT L1 TLB misses
+
+EVENTSET
+PMC0 DATA_PAGE_WALK
+PMC1 DATA_READ_OR_WRITE
+
+METRICS
+Runtime (RDTSC) [s] time
+L1 TLB misses [misses/s] PMC0/time
+L1 TLB miss ratio PMC0/PMC1
+
+LONG
+Formulas:
+L1 TLB misses [misses/s] = DATA_PAGE_WALK/time
+L1 TLB miss ratio = DATA_PAGE_WALK/DATA_READ_OR_WRITE
+-
+This performance group measures the L1 TLB misses. An L1 TLB miss that hits the
+L2 TLB has a penalty of about 25 cycles for 4kB pages. For 2MB pages, the penalty
+for an L1 TLB miss that hits the L2 TLB is about 8 cycles. The minimal L1 TLB miss
+ratio is about 1/64, so a high ratio indicates bad spatial locality: the data of a
+page is only partly accessed. It can also indicate thrashing: when multiple pages
+are accessed in a loop iteration, the size and associativity are not sufficient to
+hold all pages.
diff --git a/collectors/likwid/groups/phi/TLB_L2.txt b/collectors/likwid/groups/phi/TLB_L2.txt
new file mode 100644
index 0000000..9a95125
--- /dev/null
+++ b/collectors/likwid/groups/phi/TLB_L2.txt
@@ -0,0 +1,21 @@
+SHORT L2 TLB misses
+
+EVENTSET
+PMC0 LONG_DATA_PAGE_WALK
+PMC1 DATA_READ_OR_WRITE
+
+METRICS
+Runtime (RDTSC) [s] time
+L2 TLB misses [misses/s] PMC0/time
+L2 TLB miss ratio PMC0/PMC1
+
+LONG
+Formulas:
+L2 TLB misses [misses/s] = LONG_DATA_PAGE_WALK/time
+L2 TLB miss ratio = LONG_DATA_PAGE_WALK/DATA_READ_OR_WRITE
+-
+This performance group measures the L2 TLB misses. An L2 TLB miss has a penalty
+of at least 100 cycles, hence it is important to avoid them. A high ratio can
+indicate thrashing: when multiple pages are accessed in a loop iteration, the
+size and associativity are not sufficient to hold all pages. This would also
+result in a bad ratio for the L1 TLB.
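A short sketch of how the TLB_L1 metrics read in practice; all counts are assumed for illustration (note the ~1/64 floor mentioned above):

    # Hypothetical example: evaluate the phi TLB_L1 metrics.
    data_page_walk = 5.0e7      # assumed DATA_PAGE_WALK (L1 TLB misses)
    data_read_or_write = 2.0e9  # assumed DATA_READ_OR_WRITE accesses
    time = 1.0                  # assumed runtime in seconds

    miss_rate = data_page_walk / time                 # [misses/s]
    miss_ratio = data_page_walk / data_read_or_write  # 0.025, above the ~1/64 minimum,
    print(miss_rate, miss_ratio)                      # hinting at poor spatial locality
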
diff --git a/collectors/likwid/groups/phi/VECTOR.txt b/collectors/likwid/groups/phi/VECTOR.txt
new file mode 100644
index 0000000..b6ec6a6
--- /dev/null
+++ b/collectors/likwid/groups/phi/VECTOR.txt
@@ -0,0 +1,21 @@
+SHORT Vectorization intensity
+
+EVENTSET
+PMC0 VPU_INSTRUCTIONS_EXECUTED
+PMC1 VPU_ELEMENTS_ACTIVE
+
+METRICS
+Runtime (RDTSC) [s] time
+Vectorization intensity PMC1/PMC0
+
+LONG
+Formulas:
+Vectorization intensity = VPU_ELEMENTS_ACTIVE / VPU_INSTRUCTIONS_EXECUTED
+-
+Vector instructions include instructions that perform floating-point
+operations, instructions that load vector registers from memory and store them
+to memory, instructions to manipulate vector mask registers, and other special
+purpose instructions such as vector shuffle.
+According to the book 'Intel Xeon Phi Coprocessor High-Performance Programming'
+by James Jeffers and James Reinders, the vectorization intensity should be >=8
+for double precision and >=16 for single precision.
diff --git a/collectors/likwid/groups/phi/VECTOR2.txt b/collectors/likwid/groups/phi/VECTOR2.txt
new file mode 100644
index 0000000..52b3c59
--- /dev/null
+++ b/collectors/likwid/groups/phi/VECTOR2.txt
@@ -0,0 +1,20 @@
+SHORT Vector unit usage
+
+EVENTSET
+PMC0 VPU_INSTRUCTIONS_EXECUTED
+PMC1 VPU_STALL_REG
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] PMC1*inverseClock
+VPU stall ratio [%] 100*(PMC1/PMC0)
+
+LONG
+Formulas:
+VPU stall ratio [%] = 100*(VPU_STALL_REG/VPU_INSTRUCTIONS_EXECUTED)
+--
+This group measures how efficiently the processor works with
+regard to vectorization instruction throughput. The event VPU_STALL_REG counts
+the VPU stalls due to data dependencies. Dependencies are read-after-write,
+write-after-write and write-after-read.
+
diff --git a/collectors/likwid/groups/phi/VPU_FILL_RATIO_DBL.txt b/collectors/likwid/groups/phi/VPU_FILL_RATIO_DBL.txt
new file mode 100644
index 0000000..6e8065c
--- /dev/null
+++ b/collectors/likwid/groups/phi/VPU_FILL_RATIO_DBL.txt
@@ -0,0 +1,18 @@
+SHORT VPU filling for double precision data
+
+EVENTSET
+PMC0 VPU_INSTRUCTIONS_EXECUTED
+PMC1 VPU_ELEMENTS_ACTIVE
+
+METRICS
+Runtime (RDTSC) [s] time
+VPU fill ratio PMC0*8/PMC1
+
+LONG
+Formulas:
+VPU fill ratio = VPU_INSTRUCTIONS_EXECUTED*8/VPU_ELEMENTS_ACTIVE
+--
+This performance group measures the number of vector instructions that are
+performed on each vector loaded to the VPU. It is important to increase the
+ratio to get a high throughput because memory accesses (loading data to the VPU)
+are expensive.
diff --git a/collectors/likwid/groups/phi/VPU_PAIRING.txt b/collectors/likwid/groups/phi/VPU_PAIRING.txt
new file mode 100644
index 0000000..024919b
--- /dev/null
+++ b/collectors/likwid/groups/phi/VPU_PAIRING.txt
@@ -0,0 +1,20 @@
+SHORT VPU pairing ratio
+
+EVENTSET
+PMC0 VPU_INSTRUCTIONS_EXECUTED
+PMC1 VPU_INSTRUCTIONS_EXECUTED_V_PIPE
+
+METRICS
+Runtime (RDTSC) [s] time
+V-pipe ratio PMC1/PMC0
+Pairing ratio PMC1/(PMC0-PMC1)
+
+LONG
+Formulas:
+V-pipe ratio = VPU_INSTRUCTIONS_EXECUTED_V_PIPE/VPU_INSTRUCTIONS_EXECUTED
+Pairing ratio = VPU_INSTRUCTIONS_EXECUTED_V_PIPE/(VPU_INSTRUCTIONS_EXECUTED-VPU_INSTRUCTIONS_EXECUTED_V_PIPE)
+--
+This performance group measures the pairing ratio of vector instructions. The
+V-pipe can only execute a subset of all instructions, the main workload is done
+by the U-pipe. A higher throughput can be achieved if the pairing ratio is
+increased.
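For illustration, a minimal sketch of the VECTOR group's intensity metric against the book's thresholds; the counter values are assumptions, not measurements:

    # Hypothetical example: vectorization intensity (phi VECTOR group).
    vpu_instr = 1.0e9  # assumed VPU_INSTRUCTIONS_EXECUTED
    vpu_elems = 7.2e9  # assumed VPU_ELEMENTS_ACTIVE

    intensity = vpu_elems / vpu_instr  # elements processed per VPU instruction
    # 7.2 < 8 would suggest partially filled vectors for double precision;
    # the target is >=8 for DP and >=16 for SP.
    print(intensity)
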
diff --git a/collectors/likwid/groups/phi/VPU_READ_MISS_RATIO.txt b/collectors/likwid/groups/phi/VPU_READ_MISS_RATIO.txt
new file mode 100644
index 0000000..cf04c5f
--- /dev/null
+++ b/collectors/likwid/groups/phi/VPU_READ_MISS_RATIO.txt
@@ -0,0 +1,16 @@
+SHORT Miss ratio for VPU data reads
+
+EVENTSET
+PMC0 VPU_DATA_READ
+PMC1 VPU_DATA_READ_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+VPU read miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+VPU read miss ratio = VPU_DATA_READ_MISS/VPU_DATA_READ
+--
+This performance group determines the ratio between reads and reads that miss
+the cache and are issued by the VPU.
diff --git a/collectors/likwid/groups/phi/VPU_WRITE_MISS_RATIO.txt b/collectors/likwid/groups/phi/VPU_WRITE_MISS_RATIO.txt
new file mode 100644
index 0000000..cebf3c7
--- /dev/null
+++ b/collectors/likwid/groups/phi/VPU_WRITE_MISS_RATIO.txt
@@ -0,0 +1,16 @@
+SHORT Miss ratio for VPU data writes
+
+EVENTSET
+PMC0 VPU_DATA_WRITE
+PMC1 VPU_DATA_WRITE_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+VPU write miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+VPU write miss ratio = VPU_DATA_WRITE_MISS/VPU_DATA_WRITE
+--
+This performance group determines the ratio between writes and writes that miss
+the cache and are issued by the VPU.
diff --git a/collectors/likwid/groups/phi/WRITE_MISS_RATIO.txt b/collectors/likwid/groups/phi/WRITE_MISS_RATIO.txt
new file mode 100644
index 0000000..1e92c76
--- /dev/null
+++ b/collectors/likwid/groups/phi/WRITE_MISS_RATIO.txt
@@ -0,0 +1,15 @@
+SHORT Miss ratio for data writes
+
+EVENTSET
+PMC0 DATA_WRITE
+PMC1 DATA_WRITE_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Write miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+Write miss ratio = DATA_WRITE_MISS/DATA_WRITE
+--
+Miss ratio for data writes.
diff --git a/collectors/likwid/groups/power8/BRANCH.txt b/collectors/likwid/groups/power8/BRANCH.txt
new file mode 100644
index 0000000..870bb9d
--- /dev/null
+++ b/collectors/likwid/groups/power8/BRANCH.txt
@@ -0,0 +1,30 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+PMC0 PM_BR_PRED_BR_CMPL
+PMC1 PM_BR_PRED_CCACHE_CMPL
+PMC2 PM_BR_PRED_CR_CMPL
+PMC3 PM_BR_MPRED_CMPL
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+Branch rate (PMC0+PMC1+PMC2)/PMC4
+Branch misprediction rate PMC3/PMC4
+Branch misprediction ratio PMC3/(PMC0+PMC1+PMC2)
+Instructions per branch PMC4/(PMC0+PMC1+PMC2)
+
+LONG
+Formulas:
+Branch rate = (PM_BR_PRED_BR_CMPL+PM_BR_PRED_CCACHE_CMPL+PM_BR_PRED_CR_CMPL)/PM_RUN_INST_CMPL
+Branch misprediction rate = PM_BR_MPRED_CMPL/PM_RUN_INST_CMPL
+Branch misprediction ratio = PM_BR_MPRED_CMPL/(PM_BR_PRED_BR_CMPL+PM_BR_PRED_CCACHE_CMPL+PM_BR_PRED_CR_CMPL)
+Instructions per branch = PM_RUN_INST_CMPL/(PM_BR_PRED_BR_CMPL+PM_BR_PRED_CCACHE_CMPL+PM_BR_PRED_CR_CMPL)
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio expresses
+directly which share of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
+
diff --git a/collectors/likwid/groups/power8/CPISTACK1.txt b/collectors/likwid/groups/power8/CPISTACK1.txt
new file mode 100644
index 0000000..aa8a643
--- /dev/null
+++ b/collectors/likwid/groups/power8/CPISTACK1.txt
@@ -0,0 +1,35 @@
+SHORT First level of IBM CPI stack
+
+EVENTSET
+PMC0 PM_CMPLU_STALL_THRD
+PMC1 PM_GCT_EMPTY_CYC
+PMC3 PM_CMPLU_STALL
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+CPI PMC5/PMC4
+Stall cycles PMC3
+Stall cycle ratio PMC3/PMC5
+Thread blocked cycles PMC0
+Thread blocked cycle ratio PMC0/PMC5
+GCT empty cycles PMC1
+GCT empty cycle ratio PMC1/PMC5
+
+
+
+
+LONG
+Formulas:
+Stall cycles = PM_CMPLU_STALL
+Stall cycle ratio = PM_CMPLU_STALL/PM_RUN_CYC
+Thread blocked cycles = PM_CMPLU_STALL_THRD
+Thread blocked cycle ratio = PM_CMPLU_STALL_THRD/PM_RUN_CYC
+GCT empty cycles = PM_GCT_EMPTY_CYC
+GCT empty cycle ratio = PM_GCT_EMPTY_CYC/PM_RUN_CYC
+--
+First level of the IBM CPI stack. IBM names the levels Stalled Cycles, Waiting
+to Complete, Thread Blocked, Completion Table Empty, Other and Completion
+Cycles. For some of them there are no clearly identifiable events, so this
+group concentrates on Stalled Cycles (PM_CMPLU_STALL), Thread Blocked
+(PM_CMPLU_STALL_THRD) and Completion Table Empty (PM_GCT_EMPTY_CYC).
diff --git a/collectors/likwid/groups/power8/DATA.txt b/collectors/likwid/groups/power8/DATA.txt
new file mode 100644
index 0000000..bc3b893
--- /dev/null
+++ b/collectors/likwid/groups/power8/DATA.txt
@@ -0,0 +1,23 @@
+SHORT Load to store ratio
+
+EVENTSET
+PMC0 PM_LD_CMPL
+PMC1 PM_ST_CMPL
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+Load to store ratio PMC0/PMC1
+Load ratio PMC0/PMC4
+Store ratio PMC1/PMC4
+
+LONG
+Formulas:
+Load to store ratio = PM_LD_CMPL/PM_ST_CMPL
+Load ratio = PM_LD_CMPL/PM_RUN_INST_CMPL
+Store ratio = PM_ST_CMPL/PM_RUN_INST_CMPL
+-
+This is a metric to determine your load to store ratio.
+
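To show how the CPISTACK1 ratios decompose run cycles, a hypothetical evaluation in Python (all counter values are assumed):

    # Hypothetical example: power8 CPISTACK1 ratios from assumed counts.
    pm_run_cyc = 1.0e10          # assumed PM_RUN_CYC
    pm_cmplu_stall = 4.0e9       # assumed PM_CMPLU_STALL
    pm_cmplu_stall_thrd = 5.0e8  # assumed PM_CMPLU_STALL_THRD
    pm_gct_empty_cyc = 1.0e9     # assumed PM_GCT_EMPTY_CYC

    stall_ratio = pm_cmplu_stall / pm_run_cyc                # 0.40 of run cycles stalled
    thread_blocked_ratio = pm_cmplu_stall_thrd / pm_run_cyc  # 0.05 blocked by the other thread
    gct_empty_ratio = pm_gct_empty_cyc / pm_run_cyc          # 0.10 with an empty completion table
    print(stall_ratio, thread_blocked_ratio, gct_empty_ratio)
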
diff --git a/collectors/likwid/groups/power8/FLOPS_1_2.txt b/collectors/likwid/groups/power8/FLOPS_1_2.txt
new file mode 100644
index 0000000..27138d5
--- /dev/null
+++ b/collectors/likwid/groups/power8/FLOPS_1_2.txt
@@ -0,0 +1,24 @@
+SHORT Group 121 as used in IBM Parallel Environment Developer Edition
+
+EVENTSET
+PMC0 PM_VSU0_1FLOP
+PMC1 PM_VSU1_1FLOP
+PMC2 PM_VSU0_2FLOP
+PMC3 PM_VSU1_2FLOP
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+CPI PMC5/PMC4
+One FLOP ops PMC0+PMC1
+Two FLOPs ops PMC2+PMC3
+[MFLOP/s] 1E-6*(PMC0+PMC1+((PMC2+PMC3)*2))/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+One FLOP ops = PM_VSU0_1FLOP+PM_VSU1_1FLOP
+Two FLOPs ops = PM_VSU0_2FLOP+PM_VSU1_2FLOP
+[MFLOP/s] = 1E-6*(PM_VSU0_1FLOP+PM_VSU1_1FLOP+((PM_VSU0_2FLOP+PM_VSU1_2FLOP)*2))/time
+--
+Group 121 from web page http://www.ibm.com/support/knowledgecenter/en/SSFK5S_2.2.0/com.ibm.cluster.pedev.v2r2.pedev100.doc/bl7ug_power8metrics.htm
diff --git a/collectors/likwid/groups/power8/FLOPS_4_8.txt b/collectors/likwid/groups/power8/FLOPS_4_8.txt
new file mode 100644
index 0000000..70e600a
--- /dev/null
+++ b/collectors/likwid/groups/power8/FLOPS_4_8.txt
@@ -0,0 +1,24 @@
+SHORT Group 122 as used in IBM Parallel Environment Developer Edition
+
+EVENTSET
+PMC0 PM_VSU0_4FLOP
+PMC1 PM_VSU1_4FLOP
+PMC2 PM_VSU0_8FLOP
+PMC3 PM_VSU1_8FLOP
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+CPI PMC5/PMC4
+Four FLOPs ops PMC0+PMC1
+Eight FLOPs ops PMC2+PMC3
+MFLOP/s 1E-6*(((PMC0+PMC1)*4.0)+((PMC2+PMC3)*8.0))/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+Four FLOPs ops = PM_VSU0_4FLOP+PM_VSU1_4FLOP
+Eight FLOPs ops = PM_VSU0_8FLOP+PM_VSU1_8FLOP
+MFLOP/s = 1E-6*(((PM_VSU0_4FLOP+PM_VSU1_4FLOP)*4.0)+((PM_VSU0_8FLOP+PM_VSU1_8FLOP)*8.0))/time
+--
+Group 122 from web page http://www.ibm.com/support/knowledgecenter/en/SSFK5S_2.2.0/com.ibm.cluster.pedev.v2r2.pedev100.doc/bl7ug_power8metrics.htm
diff --git a/collectors/likwid/groups/power8/FLOPS_DP.txt b/collectors/likwid/groups/power8/FLOPS_DP.txt
new file mode 100644
index 0000000..8c3bfdf
--- /dev/null
+++ b/collectors/likwid/groups/power8/FLOPS_DP.txt
@@ -0,0 +1,27 @@
+SHORT Double Precision MFlops/s (VSU pipe 0)
+
+EVENTSET
+PMC0 PM_VSU0_DP_2FLOP
+PMC1 PM_VSU0_DP_FMA
+PMC2 PM_VSU0_DP_FSQRT_FDIV
+PMC3 PM_VSU0_SCALAR_DP_ISSUED
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+DP [MFLOP/s] 1.0E-06*((PMC0*2.0)+PMC2+(PMC1*4.0))/time
+DP VSX [MFLOP/s] 1.0E-06*((PMC1*4.0)+(PMC0*2.0))/time
+Packed [MUOPS/s] 1.0E-06*(PMC1)/time
+Scalar [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+DP [MFLOP/s] = 1.0E-06*((PM_VSU0_DP_2FLOP*2.0)+PM_VSU0_DP_FSQRT_FDIV+(PM_VSU0_DP_FMA*4.0))/runtime
+DP VSX [MFLOP/s] = 1.0E-06*((PM_VSU0_DP_FMA*4.0)+(PM_VSU0_DP_2FLOP*2.0))/runtime
+Packed [MUOPS/s] = 1.0E-06*(PM_VSU0_DP_FMA)/runtime
+Scalar [MUOPS/s] = 1.0E-06*(PM_VSU0_DP_2FLOP+PM_VSU0_DP_FSQRT_FDIV)/runtime
+--
+
diff --git a/collectors/likwid/groups/power8/FLOPS_DP2.txt b/collectors/likwid/groups/power8/FLOPS_DP2.txt
new file mode 100644
index 0000000..69ca9e2
--- /dev/null
+++ b/collectors/likwid/groups/power8/FLOPS_DP2.txt
@@ -0,0 +1,27 @@
+SHORT Double Precision MFlops/s (VSU pipe 1)
+
+EVENTSET
+PMC0 PM_VSU1_DP_2FLOP
+PMC1 PM_VSU1_DP_FMA
+PMC2 PM_VSU1_DP_FSQRT_FDIV
+PMC3 PM_VSU1_SCALAR_DP_ISSUED
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+DP [MFLOP/s] 1.0E-06*((PMC0*2.0)+PMC2+(PMC1*4.0))/time
+DP VSX [MFLOP/s] 1.0E-06*((PMC1*4.0)+(PMC0*2.0))/time
+Packed [MUOPS/s] 1.0E-06*(PMC1)/time
+Scalar [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+DP [MFLOP/s] = 1.0E-06*((PM_VSU1_DP_2FLOP*2.0)+PM_VSU1_DP_FSQRT_FDIV+(PM_VSU1_DP_FMA*4.0))/runtime
+DP VSX [MFLOP/s] = 1.0E-06*((PM_VSU1_DP_FMA*4.0)+(PM_VSU1_DP_2FLOP*2.0))/runtime
+Packed [MUOPS/s] = 1.0E-06*(PM_VSU1_DP_FMA)/runtime
+Scalar [MUOPS/s] = 1.0E-06*(PM_VSU1_DP_2FLOP+PM_VSU1_DP_FSQRT_FDIV)/runtime
+--
+
diff --git a/collectors/likwid/groups/power8/FLOPS_FMA.txt b/collectors/likwid/groups/power8/FLOPS_FMA.txt
new file mode 100644
index 0000000..8bf5234
--- /dev/null
+++ b/collectors/likwid/groups/power8/FLOPS_FMA.txt
@@ -0,0 +1,28 @@
+SHORT Group 124 as used in IBM Parallel Environment Developer Edition
+
+EVENTSET
+PMC0 PM_VSU0_DP_FMA
+PMC1 PM_VSU1_DP_FMA
+PMC2 PM_VSU0_FMA
+PMC3 PM_VSU1_FMA
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+CPI PMC5/PMC4
+DP FMAs PMC0+PMC1
+Scalar FMAs PMC2+PMC3
+DP FMA [MFLOP/s] 1E-6*(PMC0+PMC1)*4.0/time
+Scalar FMA [MFLOP/s] 1E-6*(PMC2+PMC3)*2.0/time
+[MFLOP/s] 1E-6*(((PMC0+PMC1)*4.0)+((PMC2+PMC3)*2.0))/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+DP FMAs = PM_VSU0_DP_FMA+PM_VSU1_DP_FMA
+Scalar FMAs = PM_VSU0_FMA+PM_VSU1_FMA
+DP FMA [MFLOP/s] = 1E-6*(PM_VSU0_DP_FMA+PM_VSU1_DP_FMA)*4.0/runtime
+Scalar FMA [MFLOP/s] = 1E-6*(PM_VSU0_FMA+PM_VSU1_FMA)*2.0/runtime
+[MFLOP/s] = 1E-6*(((PM_VSU0_DP_FMA+PM_VSU1_DP_FMA)*4.0)+((PM_VSU0_FMA+PM_VSU1_FMA)*2.0))/runtime
+--
+Group 124 from web page http://www.ibm.com/support/knowledgecenter/en/SSFK5S_2.2.0/com.ibm.cluster.pedev.v2r2.pedev100.doc/bl7ug_power8metrics.htm
diff --git a/collectors/likwid/groups/power8/FLOPS_SP.txt b/collectors/likwid/groups/power8/FLOPS_SP.txt
new file mode 100644
index 0000000..19bcd5c
--- /dev/null
+++ b/collectors/likwid/groups/power8/FLOPS_SP.txt
@@ -0,0 +1,27 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+PMC0 PM_VSU0_SINGLE
+PMC1 PM_VSU0_VECTOR_SP_ISSUED
+PMC2 PM_VSU1_SINGLE
+PMC3 PM_VSU1_VECTOR_SP_ISSUED
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+SP [MFLOP/s] 1.0E-06*(((PMC0-PMC1)+(PMC2-PMC3))*4.0+(PMC1+PMC3)*8.0)/time
+SP VSX [MFLOP/s] 1.0E-06*((PMC1+PMC3)*8.0)/time
+Packed [MUOPS/s] 1.0E-06*(PMC1+PMC3)/time
+Scalar [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+SP [MFLOP/s] = 1.0E-06*(((PM_VSU0_SINGLE-PM_VSU0_VECTOR_SP_ISSUED)+(PM_VSU1_SINGLE-PM_VSU1_VECTOR_SP_ISSUED))*4.0+(PM_VSU0_VECTOR_SP_ISSUED+PM_VSU1_VECTOR_SP_ISSUED)*8.0)/runtime
+SP VSX [MFLOP/s] = 1.0E-06*((PM_VSU0_VECTOR_SP_ISSUED+PM_VSU1_VECTOR_SP_ISSUED)*8.0)/runtime
+Packed [MUOPS/s] = 1.0E-06*(PM_VSU0_VECTOR_SP_ISSUED+PM_VSU1_VECTOR_SP_ISSUED)/runtime
+Scalar [MUOPS/s] = 1.0E-06*(PM_VSU0_SINGLE+PM_VSU1_SINGLE)/runtime
+--
+
diff --git a/collectors/likwid/groups/power8/FLOPS_VSU0.txt b/collectors/likwid/groups/power8/FLOPS_VSU0.txt
new file mode 100644
index 0000000..fa94626
--- /dev/null
+++ b/collectors/likwid/groups/power8/FLOPS_VSU0.txt
@@ -0,0 +1,23 @@
+SHORT Double Precision MFlops/s performed by VSU pipe 0
+
+EVENTSET
+PMC0 PM_VSU0_DP_2FLOP
+PMC1 PM_VSU0_DP_FMA
+PMC2 PM_VSU0_DP_FSQRT_FDIV
+PMC3 PM_VSU0_1FLOP
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+[MFLOP/s] 1.0E-06*((PMC0*2.0)+(PMC2*8.0)+(PMC1*4.0)+PMC3)/time
+VSX [MFLOP/s] 1.0E-06*(PMC1*4.0)/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+[MFLOP/s] = 1.0E-06*((PM_VSU0_DP_2FLOP*2.0)+(PM_VSU0_DP_FSQRT_FDIV*8.0)+(PM_VSU0_DP_FMA*4.0)+PM_VSU0_1FLOP)/runtime
+VSX [MFLOP/s] = 1.0E-06*(PM_VSU0_DP_FMA*4.0)/runtime
+--
+
diff --git a/collectors/likwid/groups/power8/FLOPS_VSU1.txt b/collectors/likwid/groups/power8/FLOPS_VSU1.txt
new file mode 100644
index 0000000..617ab88
--- /dev/null
+++ b/collectors/likwid/groups/power8/FLOPS_VSU1.txt
@@ -0,0 +1,22 @@
+SHORT Double Precision MFlops/s performed by VSU pipe 1
+
+EVENTSET
+PMC0 PM_VSU1_DP_2FLOP
+PMC1 PM_VSU1_DP_FMA
+PMC2 PM_VSU1_DP_FSQRT_FDIV
+PMC3 PM_VSU1_1FLOP
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+[MFLOP/s] 1.0E-06*((PMC0*2.0)+(PMC2*8.0)+(PMC1*4.0)+PMC3)/time
+VSX [MFLOP/s] 1.0E-06*(PMC1*4.0)/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+[MFLOP/s] = 1.0E-06*((PM_VSU1_DP_2FLOP*2.0)+(PM_VSU1_DP_FSQRT_FDIV*8.0)+(PM_VSU1_DP_FMA*4.0)+PM_VSU1_1FLOP)/runtime
+VSX [MFLOP/s] = 1.0E-06*(PM_VSU1_DP_FMA*4.0)/runtime
+--
diff --git a/collectors/likwid/groups/power8/FLOPS_VSX.txt b/collectors/likwid/groups/power8/FLOPS_VSX.txt
new file mode 100644
index 0000000..063ad0c
--- /dev/null
+++ b/collectors/likwid/groups/power8/FLOPS_VSX.txt
@@ -0,0 +1,29 @@
+SHORT Vectorized MFlops/s
+
+EVENTSET
+PMC0 PM_VSU0_VECTOR_DP_ISSUED
+PMC1 PM_VSU1_VECTOR_DP_ISSUED
+PMC2 PM_VSU0_VECTOR_SP_ISSUED
+PMC3 PM_VSU1_VECTOR_SP_ISSUED
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+[MFLOP/s] 1.0E-06*((PMC0+PMC1)*4.0+(PMC2+PMC3)*8.0)/time
+DP [MFLOP/s] 1.0E-06*((PMC0+PMC1)*4.0)/time
+SP [MFLOP/s] 1.0E-06*((PMC2+PMC3)*8.0)/time
+DP [MUOPS/s] 1.0E-06*(PMC0+PMC1)/time
+SP [MUOPS/s] 1.0E-06*(PMC2+PMC3)/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+[MFLOP/s] = 1.0E-06*((PM_VSU0_VECTOR_DP_ISSUED+PM_VSU1_VECTOR_DP_ISSUED)*4.0+(PM_VSU0_VECTOR_SP_ISSUED+PM_VSU1_VECTOR_SP_ISSUED)*8.0)/runtime
+DP [MFLOP/s] = 1.0E-06*((PM_VSU0_VECTOR_DP_ISSUED+PM_VSU1_VECTOR_DP_ISSUED)*4.0)/runtime
+SP [MFLOP/s] = 1.0E-06*((PM_VSU0_VECTOR_SP_ISSUED+PM_VSU1_VECTOR_SP_ISSUED)*8.0)/runtime
+DP [MUOPS/s] = 1.0E-06*(PM_VSU0_VECTOR_DP_ISSUED+PM_VSU1_VECTOR_DP_ISSUED)/runtime
+SP [MUOPS/s] = 1.0E-06*(PM_VSU0_VECTOR_SP_ISSUED+PM_VSU1_VECTOR_SP_ISSUED)/runtime
+--
+
diff --git a/collectors/likwid/groups/power8/ICACHE.txt b/collectors/likwid/groups/power8/ICACHE.txt
new file mode 100644
index 0000000..7a07fd4
--- /dev/null
+++ b/collectors/likwid/groups/power8/ICACHE.txt
@@ -0,0 +1,22 @@
+SHORT Instruction cache miss rate/ratio
+
+EVENTSET
+PMC0 PM_INST_FROM_L1
+PMC1 PM_L1_ICACHE_MISS
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+L1I request rate PMC0/PMC4
+L1I miss rate PMC1/PMC4
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = PM_INST_FROM_L1/PM_RUN_INST_CMPL
+L1I miss rate = PM_L1_ICACHE_MISS/PM_RUN_INST_CMPL
+L1I miss ratio = PM_L1_ICACHE_MISS/PM_INST_FROM_L1
+-
+This group measures some L1 instruction cache metrics.
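For illustration, a small Python sketch of the FLOPS_1_2 arithmetic above: two-flop instructions are weighted twice before normalizing by runtime (all counts assumed, not measured):

    # Hypothetical example: power8 FLOPS_1_2 MFLOP/s from assumed counts.
    time = 2.0                # assumed runtime in seconds
    one_flop = 1.0e9 + 1.1e9  # assumed PM_VSU0_1FLOP + PM_VSU1_1FLOP
    two_flop = 3.0e9 + 2.9e9  # assumed PM_VSU0_2FLOP + PM_VSU1_2FLOP

    mflops = 1e-6 * (one_flop + two_flop * 2) / time  # weight 2-flop instructions twice
    print(f"{mflops:.0f} MFLOP/s")
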
diff --git a/collectors/likwid/groups/power8/L1.txt b/collectors/likwid/groups/power8/L1.txt
new file mode 100644
index 0000000..19dc36e
--- /dev/null
+++ b/collectors/likwid/groups/power8/L1.txt
@@ -0,0 +1,33 @@
+SHORT L1 cache bandwidth in MBytes/s
+
+EVENTSET
+PMC0 PM_LD_REF_L1
+PMC1 PM_ST_CMPL
+PMC2 PM_LSU_L1_PREF
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+L1D load bandwidth [MBytes/s] 1.0E-06*((PMC0+PMC2)/2)*64.0/time
+L1D load data volume [GBytes] 1.0E-09*((PMC0+PMC2)/2)*64.0
+L1D store bandwidth [MBytes/s] 1.0E-06*((PMC1/2))*64.0/time
+L1D store data volume [GBytes] 1.0E-09*((PMC1/2))*64.0
+L1 bandwidth [MBytes/s] 1.0E-06*((PMC1+PMC0+PMC2)/2)*64.0/time
+L1 data volume [GBytes] 1.0E-09*((PMC1+PMC0+PMC2)/2)*64.0
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+L1D load bandwidth [MBytes/s] = 1.0E-06*((PM_LD_REF_L1+PM_LSU_L1_PREF)/2)*64.0/time
+L1D load data volume [GBytes] = 1.0E-09*((PM_LD_REF_L1+PM_LSU_L1_PREF)/2)*64.0
+L1D store bandwidth [MBytes/s] = 1.0E-06*(PM_ST_CMPL/2)*64.0/time
+L1D store data volume [GBytes] = 1.0E-09*(PM_ST_CMPL/2)*64.0
+L1 bandwidth [MBytes/s] = 1.0E-06*((PM_ST_CMPL+PM_LD_REF_L1+PM_LSU_L1_PREF)/2)*64.0/time
+L1 data volume [GBytes] = 1.0E-09*((PM_ST_CMPL+PM_LD_REF_L1+PM_LSU_L1_PREF)/2)*64.0
+-
+Profiling group to measure L1 data cache bandwidth. The bandwidth is computed from
+the loads, stores and hardware prefetches that reference the L1 data cache.
+There is currently no event to get the evicted data volume.
diff --git a/collectors/likwid/groups/power8/L2.txt b/collectors/likwid/groups/power8/L2.txt
new file mode 100644
index 0000000..d5af584
--- /dev/null
+++ b/collectors/likwid/groups/power8/L2.txt
@@ -0,0 +1,32 @@
+SHORT L2 cache bandwidth in MBytes/s
+
+EVENTSET
+PMC0 PM_L2_ST
+PMC2 PM_LD_MISS_L1
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+L2D load bandwidth [MBytes/s] 1.0E-06*(PMC2/2)*128.0/time
+L2D load data volume [GBytes] 1.0E-09*(PMC2/2)*128.0
+L2D store bandwidth [MBytes/s] 1.0E-06*(PMC0/2)*128.0/time
+L2D store data volume [GBytes] 1.0E-09*(PMC0/2)*128.0
+L2 bandwidth [MBytes/s] 1.0E-06*((PMC0+PMC2)/2)*128.0/time
+L2 data volume [GBytes] 1.0E-09*((PMC0+PMC2)/2)*128.0
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+L2D load bandwidth [MBytes/s] = 1.0E-06*(PM_LD_MISS_L1/2)*128.0/time
+L2D load data volume [GBytes] = 1.0E-09*(PM_LD_MISS_L1/2)*128.0
+L2D store bandwidth [MBytes/s] = 1.0E-06*(PM_L2_ST/2)*128.0/time
+L2D store data volume [GBytes] = 1.0E-09*(PM_L2_ST/2)*128.0
+L2 bandwidth [MBytes/s] = 1.0E-06*((PM_L2_ST+PM_LD_MISS_L1)/2)*128.0/time
+L2 data volume [GBytes] = 1.0E-09*((PM_L2_ST+PM_LD_MISS_L1)/2)*128.0
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the
+number of cache lines transferred between the L2 and the L1 data cache. There is
+currently no event to get the evicted data volume.
diff --git a/collectors/likwid/groups/power8/L2CACHE.txt b/collectors/likwid/groups/power8/L2CACHE.txt
new file mode 100644
index 0000000..47bcedd
--- /dev/null
+++ b/collectors/likwid/groups/power8/L2CACHE.txt
@@ -0,0 +1,40 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+PMC0 PM_L2_ST_MISS
+PMC1 PM_L2_LD_MISS
+PMC2 PM_L2_LD_DISP
+PMC3 PM_L2_ST_DISP
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+L2 request rate (PMC2+PMC3)/PMC4
+L2 miss rate (PMC0+PMC1)/PMC4
+L2 miss ratio (PMC0+PMC1)/(PMC2+PMC3)
+
+LONG
+Formulas:
+L2 request rate = (PM_L2_LD_DISP+PM_L2_ST_DISP)/PM_RUN_INST_CMPL
+L2 miss rate = (PM_L2_LD_MISS+PM_L2_ST_MISS)/PM_RUN_INST_CMPL
+L2 miss ratio = (PM_L2_LD_MISS+PM_L2_ST_MISS)/(PM_L2_LD_DISP+PM_L2_ST_DISP)
+L2 load request rate = PM_L2_LD_DISP/PM_RUN_INST_CMPL
+L2 store request rate = PM_L2_ST_DISP/PM_RUN_INST_CMPL
+L2 load miss rate = PM_L2_LD_MISS/PM_RUN_INST_CMPL
+L2 store miss rate = PM_L2_ST_MISS/PM_RUN_INST_CMPL
+L2 load miss ratio = PM_L2_LD_MISS/(PM_L2_LD_DISP+PM_L2_ST_DISP)
+L2 store miss ratio = PM_L2_ST_MISS/(PM_L2_LD_DISP+PM_L2_ST_DISP)
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. The L2 request rate tells you how data intensive your code is,
+i.e. how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure of how often it was necessary to get
+cache lines from memory. And finally the L2 miss ratio tells you how many of
+your memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm, you should
+try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/collectors/likwid/groups/power8/L3.txt b/collectors/likwid/groups/power8/L3.txt
new file mode 100644
index 0000000..0737c44
--- /dev/null
+++ b/collectors/likwid/groups/power8/L3.txt
@@ -0,0 +1,29 @@
+SHORT L3 cache bandwidth in MBytes/s
+
+EVENTSET
+PMC0 PM_L3_LD_PREF
+PMC3 PM_DATA_FROM_L3
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+L3D load bandwidth [MBytes/s] 1.0E-06*(PMC3+(PMC0-PMC3))*128.0/time
+L3D load data volume [GBytes] 1.0E-09*(PMC3+(PMC0-PMC3))*128.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC3+(PMC0-PMC3))*128.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC3+(PMC0-PMC3))*128.0
+Loads from local L3 per cycle 100.0*(PMC3+(PMC0-PMC3))/PMC5
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+L3D load bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_FROM_L3+(PM_L3_LD_PREF-PM_DATA_FROM_L3))*128.0/time
+L3D load data volume [GBytes] = 1.0E-09*(PM_DATA_FROM_L3+(PM_L3_LD_PREF-PM_DATA_FROM_L3))*128.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_FROM_L3+(PM_L3_LD_PREF-PM_DATA_FROM_L3))*128.0/time
+L3 data volume [GBytes] = 1.0E-09*(PM_DATA_FROM_L3+(PM_L3_LD_PREF-PM_DATA_FROM_L3))*128.0
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed from the
+number of cache lines loaded from the L3 to the L2 cache. There is currently no
+event to get the evicted data volume.
diff --git a/collectors/likwid/groups/power8/MEM.txt b/collectors/likwid/groups/power8/MEM.txt
new file mode 100644
index 0000000..4831c80
--- /dev/null
+++ b/collectors/likwid/groups/power8/MEM.txt
@@ -0,0 +1,30 @@
+SHORT Main memory bandwidth in MBytes/s
+
+EVENTSET
+PMC0 PM_L3_CO_MEPF
+PMC1 PM_DATA_ALL_FROM_MEMORY
+PMC3 PM_L3_PF_ON_CHIP_MEM
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+Memory load bandwidth [MBytes/s] 1.0E-06*(PMC1+PMC3)*128.0/time
+Memory load data volume [GBytes] 1.0E-09*(PMC1+PMC3)*128.0
+Memory evict bandwidth [MBytes/s] 1.0E-06*(PMC0)*128.0/time
+Memory evict data volume [GBytes] 1.0E-09*(PMC0)*128.0
+Memory bandwidth [MBytes/s] 1.0E-06*(PMC1+PMC3+PMC0)*128.0/time
+Memory data volume [GBytes] 1.0E-09*(PMC1+PMC3+PMC0)*128.0
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC / PM_RUN_INST_CMPL
+Memory load bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_ALL_FROM_MEMORY+PM_L3_PF_ON_CHIP_MEM)*128.0/time
+Memory load data volume [GBytes] = 1.0E-09*(PM_DATA_ALL_FROM_MEMORY+PM_L3_PF_ON_CHIP_MEM)*128.0
+Memory evict bandwidth [MBytes/s] = 1.0E-06*(PM_L3_CO_MEPF)*128.0/time
+Memory evict data volume [GBytes] = 1.0E-09*(PM_L3_CO_MEPF)*128.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_ALL_FROM_MEMORY+PM_L3_PF_ON_CHIP_MEM+PM_L3_CO_MEPF)*128.0/time
+Memory data volume [GBytes] = 1.0E-09*(PM_DATA_ALL_FROM_MEMORY+PM_L3_PF_ON_CHIP_MEM+PM_L3_CO_MEPF)*128.0
+--
+This group uses the core-local events to measure data traffic from memory.
diff --git a/collectors/likwid/groups/power8/NUMA.txt b/collectors/likwid/groups/power8/NUMA.txt
new file mode 100644
index 0000000..e2a6e71
--- /dev/null
+++ b/collectors/likwid/groups/power8/NUMA.txt
@@ -0,0 +1,29 @@
+SHORT Memory bandwidth in MBytes/s for local and remote memory
+
+EVENTSET
+PMC1 PM_DATA_ALL_FROM_LMEM
+PMC3 PM_DATA_ALL_FROM_DMEM
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+Local bandwidth [MBytes/s] 1.0E-06*(PMC1)*128.0/time
+Local data volume [GBytes] 1.0E-09*(PMC1)*128.0
+Remote bandwidth [MBytes/s] 1.0E-06*(PMC3)*128.0/time
+Remote data volume [GBytes] 1.0E-09*(PMC3)*128.0
+Memory load bandwidth [MBytes/s] 1.0E-06*(PMC1+PMC3)*128.0/time
+Memory load data volume [GBytes] 1.0E-09*(PMC1+PMC3)*128.0
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC / PM_RUN_INST_CMPL
+Local bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_ALL_FROM_LMEM)*128.0/time
+Local data volume [GBytes] = 1.0E-09*(PM_DATA_ALL_FROM_LMEM)*128.0
+Remote bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_ALL_FROM_DMEM)*128.0/time
+Remote data volume [GBytes] = 1.0E-09*(PM_DATA_ALL_FROM_DMEM)*128.0
+Memory load bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_ALL_FROM_LMEM+PM_DATA_ALL_FROM_DMEM)*128.0/time
+Memory load data volume [GBytes] = 1.0E-09*(PM_DATA_ALL_FROM_LMEM+PM_DATA_ALL_FROM_DMEM)*128.0
+--
+This group measures the NUMA traffic by separating local from remote memory data transfers.
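A hedged sketch of how the NUMA group separates local from remote traffic (128-byte POWER8 cache lines; all counter values assumed):

    # Hypothetical example: local vs. remote share from the power8 NUMA group.
    lmem = 8.0e8  # assumed PM_DATA_ALL_FROM_LMEM (lines from local memory)
    dmem = 2.0e8  # assumed PM_DATA_ALL_FROM_DMEM (lines from remote/distant memory)
    time = 1.0    # assumed runtime in seconds

    local_bw = 1.0e-06 * lmem * 128.0 / time   # [MBytes/s]
    remote_bw = 1.0e-06 * dmem * 128.0 / time  # [MBytes/s]
    remote_share = dmem / (lmem + dmem)        # 0.2: a fifth of the loads are remote
    print(local_bw, remote_bw, remote_share)
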
diff --git a/collectors/likwid/groups/power8/STALLS1.txt b/collectors/likwid/groups/power8/STALLS1.txt
new file mode 100644
index 0000000..6acf949
--- /dev/null
+++ b/collectors/likwid/groups/power8/STALLS1.txt
@@ -0,0 +1,33 @@
+SHORT Completion stalls (group 1)
+
+EVENTSET
+PMC0 PM_CMPLU_STALL_THRD
+PMC1 PM_CMPLU_STALL_DCACHE_MISS
+PMC2 PM_CMPLU_STALL_COQ_FULL
+PMC3 PM_CMPLU_STALL
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+Completion stall cycles PMC3
+Stall cycles by thread conflict PMC0
+Stall ratio by thread conflict [%] PMC0/PMC3*100.0
+Stall cycles by d-cache miss PMC1
+Stall ratio by d-cache miss [%] PMC1/PMC3*100.0
+Stall cycles by full castout queue PMC2
+Stall ratio by full castout queue [%] PMC2/PMC3*100.0
+
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC / PM_RUN_INST_CMPL
+Completion stall cycles = PM_CMPLU_STALL
+Stall cycles by thread conflict = PM_CMPLU_STALL_THRD
+Stall ratio by thread conflict [%] = PM_CMPLU_STALL_THRD/PM_CMPLU_STALL*100
+Stall cycles by d-cache miss = PM_CMPLU_STALL_DCACHE_MISS
+Stall ratio by d-cache miss [%] = PM_CMPLU_STALL_DCACHE_MISS/PM_CMPLU_STALL*100
+Stall cycles by full castout queue = PM_CMPLU_STALL_COQ_FULL
+Stall ratio by full castout queue [%] = PM_CMPLU_STALL_COQ_FULL/PM_CMPLU_STALL*100
+--
diff --git a/collectors/likwid/groups/power8/STALLS2.txt b/collectors/likwid/groups/power8/STALLS2.txt
new file mode 100644
index 0000000..6329624
--- /dev/null
+++ b/collectors/likwid/groups/power8/STALLS2.txt
@@ -0,0 +1,32 @@
+SHORT Completion stalls (group 2)
+
+EVENTSET
+PMC0 PM_CMPLU_STALL
+PMC1 PM_CMPLU_STALL_LSU
+PMC2 PM_CMPLU_STALL_FLUSH
+PMC3 PM_CMPLU_STALL_BRU
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+CPI PMC5/PMC4
+Stall cycles PMC0
+Stall cycles by load/store unit PMC1
+Stall ratio by load/store unit [%] PMC1/PMC0*100.0
+Stall cycles by pipeline flush PMC2
+Stall ratio by pipeline flush [%] PMC2/PMC0*100.0
+Stall cycles by branch unit PMC3
+Stall ratio by branch unit [%] PMC3/PMC0*100.0
+
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC / PM_RUN_INST_CMPL
+Stall cycles = PM_CMPLU_STALL
+Stall cycles by load/store unit = PM_CMPLU_STALL_LSU
+Stall ratio by load/store unit [%] = PM_CMPLU_STALL_LSU/PM_CMPLU_STALL*100.0
+Stall cycles by pipeline flush = PM_CMPLU_STALL_FLUSH
+Stall ratio by pipeline flush [%] = PM_CMPLU_STALL_FLUSH/PM_CMPLU_STALL*100.0
+Stall cycles by branch unit = PM_CMPLU_STALL_BRU
+Stall ratio by branch unit [%] = PM_CMPLU_STALL_BRU/PM_CMPLU_STALL*100.0
+--
diff --git a/collectors/likwid/groups/power8/TLB_DATA.txt b/collectors/likwid/groups/power8/TLB_DATA.txt
new file mode 100644
index 0000000..c7df459
--- /dev/null
+++ b/collectors/likwid/groups/power8/TLB_DATA.txt
@@ -0,0 +1,37 @@
+SHORT L1 Data TLB miss rate/ratio
+
+EVENTSET
+PMC0 PM_DTLB_MISS_16G
+PMC1 PM_DTLB_MISS_4K
+PMC2 PM_DTLB_MISS_64K
+PMC3 PM_DTLB_MISS_16M
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+L1 DTLB 4K misses PMC1
+L1 DTLB 4K miss rate PMC1/PMC4
+L1 DTLB 64K misses PMC2
+L1 DTLB 64K miss rate PMC2/PMC4
+L1 DTLB 16M misses PMC3
+L1 DTLB 16M miss rate PMC3/PMC4
+L1 DTLB 16G misses PMC0
+L1 DTLB 16G miss rate PMC0/PMC4
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC / PM_RUN_INST_CMPL
+L1 DTLB 4K misses = PM_DTLB_MISS_4K
+L1 DTLB 4K miss rate = PM_DTLB_MISS_4K/PM_RUN_INST_CMPL
+L1 DTLB 64K misses = PM_DTLB_MISS_64K
+L1 DTLB 64K miss rate = PM_DTLB_MISS_64K/PM_RUN_INST_CMPL
+L1 DTLB 16M misses = PM_DTLB_MISS_16M
+L1 DTLB 16M miss rate = PM_DTLB_MISS_16M/PM_RUN_INST_CMPL
+L1 DTLB 16G misses = PM_DTLB_MISS_16G
+L1 DTLB 16G miss rate = PM_DTLB_MISS_16G/PM_RUN_INST_CMPL
+--
+The DTLB miss rates give a measure of how often a TLB miss occurred
+per instruction.
+
diff --git a/collectors/likwid/groups/power8/TLB_INSTR.txt b/collectors/likwid/groups/power8/TLB_INSTR.txt
new file mode 100644
index 0000000..3f8b79c
--- /dev/null
+++ b/collectors/likwid/groups/power8/TLB_INSTR.txt
@@ -0,0 +1,21 @@
+SHORT L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+PMC2 PM_ITLB_MISS
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+L1 ITLB misses PMC2
+L1 ITLB miss rate PMC2/PMC4
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC / PM_RUN_INST_CMPL
+L1 ITLB misses = PM_ITLB_MISS
+L1 ITLB miss rate = PM_ITLB_MISS/PM_RUN_INST_CMPL
+--
+The ITLB miss rate gives a measure of how often a TLB miss occurred per instruction.
+
diff --git a/collectors/likwid/groups/power8/USEFUL.txt b/collectors/likwid/groups/power8/USEFUL.txt
new file mode 100644
index 0000000..0b5fc8f
--- /dev/null
+++ b/collectors/likwid/groups/power8/USEFUL.txt
@@ -0,0 +1,24 @@
+SHORT Rate of useful instructions
+
+EVENTSET
+PMC0 PM_IOPS_CMPL
+PMC1 PM_INST_DISP
+PMC2 PM_IOPS_DISP
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+CPI PMC5/PMC4
+Useful instr. rate PMC4/PMC1*100.0
+Useful uops rate PMC0/PMC2*100.0
+
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC / PM_RUN_INST_CMPL
+Useful instr. rate = PM_RUN_INST_CMPL/PM_INST_DISP*100.0
+Useful uops rate = PM_IOPS_CMPL/PM_IOPS_DISP*100.0
+--
+This group measures how many of the dispatched instructions and internal operations (uops) are
+actually completed. These metrics compare the speculatively dispatched instructions and
+operations to the completed ones.
diff --git a/collectors/likwid/groups/power9/BRANCH.txt b/collectors/likwid/groups/power9/BRANCH.txt
new file mode 100644
index 0000000..1f6dd0d
--- /dev/null
+++ b/collectors/likwid/groups/power9/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+PMC1 PM_BR_PRED
+PMC2 PM_IOPS_CMPL
+PMC3 PM_BR_MPRED_CMPL
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+Branch rate (PMC1)/PMC4
+Branch misprediction rate PMC3/PMC4
+Branch misprediction ratio PMC3/(PMC1)
+Instructions per branch PMC4/(PMC1)
+Operations per branch PMC2/PMC1
+
+LONG
+Formulas:
+Branch rate = PM_BR_PRED/PM_RUN_INST_CMPL
+Branch misprediction rate = PM_BR_MPRED_CMPL/PM_RUN_INST_CMPL
+Branch misprediction ratio = PM_BR_MPRED_CMPL/PM_BR_PRED
+Instructions per branch = PM_RUN_INST_CMPL/PM_BR_PRED
+Operations per branch = PM_IOPS_CMPL/PM_BR_PRED
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio expresses
+directly which share of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
+
diff --git a/collectors/likwid/groups/power9/DATA.txt b/collectors/likwid/groups/power9/DATA.txt
new file mode 100644
index 0000000..a8a7cae
--- /dev/null
+++ b/collectors/likwid/groups/power9/DATA.txt
@@ -0,0 +1,23 @@
+SHORT Load to store ratio
+
+EVENTSET
+PMC3 PM_LD_CMPL
+PMC1 PM_ST_CMPL
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+Load to store ratio PMC3/PMC1
+Load rate PMC3/PMC4
+Store rate PMC1/PMC4
+
+LONG
+Formulas:
+Load to store ratio = PM_LD_CMPL/PM_ST_CMPL
+Load rate = PM_LD_CMPL/PM_RUN_INST_CMPL
+Store rate = PM_ST_CMPL/PM_RUN_INST_CMPL
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/collectors/likwid/groups/power9/FLOPS.txt b/collectors/likwid/groups/power9/FLOPS.txt
new file mode 100644
index 0000000..ffaf11f
--- /dev/null
+++ b/collectors/likwid/groups/power9/FLOPS.txt
@@ -0,0 +1,25 @@
+SHORT SP/DP scalar/vector MFlops/s
+
+EVENTSET
+PMC3 PM_FLOP_CMPL
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+SP/DP [MFLOP/s] (scalar assumed) 1.0E-06*PMC3*2.0/time
+SP [MFLOP/s] (vector assumed) 1.0E-06*PMC3*8.0/time
+DP [MFLOP/s] (vector assumed) 1.0E-06*PMC3*4.0/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+SP/DP [MFLOP/s] (scalar assumed) = 1.0E-06*PM_FLOP_CMPL*2.0/runtime
+SP [MFLOP/s] (vector assumed) = 1.0E-06*PM_FLOP_CMPL*8.0/runtime
+DP [MFLOP/s] (vector assumed) = 1.0E-06*PM_FLOP_CMPL*4.0/runtime
+--
+This group counts floating-point operations. Everything is derived from a
+single event, PM_FLOP_CMPL, so if you have mixed usage of SP and DP and of
+scalar and vector operations, the count won't be exact. With pure codes
+the counts are pretty accurate (e.g. when using likwid-bench).
diff --git a/collectors/likwid/groups/power9/FLOPS_FMA.txt b/collectors/likwid/groups/power9/FLOPS_FMA.txt
new file mode 100644
index 0000000..65e9b3b
--- /dev/null
+++ b/collectors/likwid/groups/power9/FLOPS_FMA.txt
@@ -0,0 +1,21 @@
+SHORT Floating-point operations with scalar FMA instructions
+
+EVENTSET
+PMC3 PM_FMA_CMPL
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+Scalar FMAs PMC3
+Scalar FMA [MFLOP/s] 1E-6*(PMC3)*2.0/time
+
+LONG
+Formulas:
+Scalar FMAs = PM_FMA_CMPL
+Scalar FMA [MFLOP/s] = 1E-6*(PM_FMA_CMPL)*2.0/runtime
+--
+This group counts scalar FMA operations.
+PM_FMA_CMPL: Two-flops instruction completed (fmadd, fnmadd, fmsub,
+fnmsub). Scalar instructions only.
diff --git a/collectors/likwid/groups/power9/FLOPS_VSX.txt b/collectors/likwid/groups/power9/FLOPS_VSX.txt
new file mode 100644
index 0000000..594adf0
--- /dev/null
+++ b/collectors/likwid/groups/power9/FLOPS_VSX.txt
@@ -0,0 +1,23 @@
+SHORT Vectorized MFlops/s
+
+EVENTSET
+PMC1 PM_VSU_FIN
+PMC3 PM_VECTOR_FLOP_CMPL
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+SP [MFLOP/s] (assumed) 1.0E-06*(PMC3*8.0)/time
+DP [MFLOP/s] (assumed) 1.0E-06*(PMC3*4.0)/time
+Vector MIOPS/s 1.0E-06*(PMC1)/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+SP [MFLOP/s] (assumed) = 1.0E-06*(PM_VECTOR_FLOP_CMPL*8)/runtime
+DP [MFLOP/s] (assumed) = 1.0E-06*(PM_VECTOR_FLOP_CMPL*4)/runtime
+Vector MIOPS/s = 1.0E-06*(PM_VSU_FIN)/runtime
+--
+This group measures vector operations. There is no differentiation between SP and DP possible.
diff --git a/collectors/likwid/groups/power9/ICACHE.txt b/collectors/likwid/groups/power9/ICACHE.txt
new file mode 100644
index 0000000..7a07fd4
--- /dev/null
+++ b/collectors/likwid/groups/power9/ICACHE.txt
@@ -0,0 +1,22 @@
+SHORT Instruction cache miss rate/ratio
+
+EVENTSET
+PMC0 PM_INST_FROM_L1
+PMC1 PM_L1_ICACHE_MISS
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+L1I request rate PMC0/PMC4
+L1I miss rate PMC1/PMC4
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = PM_INST_FROM_L1/PM_RUN_INST_CMPL
+L1I miss rate = PM_L1_ICACHE_MISS/PM_RUN_INST_CMPL
+L1I miss ratio = PM_L1_ICACHE_MISS/PM_INST_FROM_L1
+-
+This group measures some L1 instruction cache metrics.
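Since the power9 FLOPS group derives everything from PM_FLOP_CMPL, the three metrics bracket the true FLOP rate rather than pin it down. A sketch with an assumed count makes this explicit:

    # Hypothetical example: bracketing FLOP rates from the power9 FLOPS group.
    pm_flop_cmpl = 1.0e10  # assumed PM_FLOP_CMPL count
    time = 2.0             # assumed runtime in seconds

    scalar = 1.0e-06 * pm_flop_cmpl * 2.0 / time  # all-scalar assumption
    dp_vec = 1.0e-06 * pm_flop_cmpl * 4.0 / time  # all-DP-vector assumption
    sp_vec = 1.0e-06 * pm_flop_cmpl * 8.0 / time  # all-SP-vector assumption
    # The true MFLOP/s of a mixed code lies somewhere between these bounds.
    print(scalar, dp_vec, sp_vec)
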
diff --git a/collectors/likwid/groups/power9/L2CACHE.txt b/collectors/likwid/groups/power9/L2CACHE.txt
new file mode 100644
index 0000000..9873251
--- /dev/null
+++ b/collectors/likwid/groups/power9/L2CACHE.txt
@@ -0,0 +1,33 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+PMC1 PM_L2_LD_MISS
+PMC2 PM_L2_LD_DISP
+PMC3 PM_L2_ST_DISP
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+L2 request rate (PMC2+PMC3)/PMC4
+L2 load miss rate PMC1/PMC4
+L2 load miss ratio PMC1/(PMC2+PMC3)
+
+LONG
+Formulas:
+L2 request rate = (PM_L2_LD_DISP+PM_L2_ST_DISP)/PM_RUN_INST_CMPL
+L2 load miss rate = (PM_L2_LD_MISS)/PM_RUN_INST_CMPL
+L2 load miss ratio = (PM_L2_LD_MISS)/(PM_L2_LD_DISP+PM_L2_ST_DISP)
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. The L2 request rate tells you how data intensive your code is,
+i.e. how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure of how often it was necessary to get
+cache lines from memory. And finally the L2 load miss ratio tells you how many of
+your memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm, you should
+try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/collectors/likwid/groups/power9/L2LOAD.txt b/collectors/likwid/groups/power9/L2LOAD.txt
new file mode 100644
index 0000000..ebac5a2
--- /dev/null
+++ b/collectors/likwid/groups/power9/L2LOAD.txt
@@ -0,0 +1,23 @@
+SHORT L2 cache load bandwidth in MBytes/s
+
+EVENTSET
+PMC0 PM_L2_LD
+PMC2 PM_L2_INST
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+L2 load bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC2)*128.0/time
+L2 load data volume [GBytes] 1.0E-09*(PMC0+PMC2)*128.0
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+L2 load bandwidth [MBytes/s] = 1.0E-06*(PM_L2_LD+PM_L2_INST)*128.0/time
+L2 load data volume [GBytes] = 1.0E-09*(PM_L2_LD+PM_L2_INST)*128.0
+-
+Profiling group to measure L2 load cache bandwidth. The bandwidth is computed from
+the number of cache lines loaded from the L2 cache to the L1.
diff --git a/collectors/likwid/groups/power9/L2STORE.txt b/collectors/likwid/groups/power9/L2STORE.txt
new file mode 100644
index 0000000..3b1c0af
--- /dev/null
+++ b/collectors/likwid/groups/power9/L2STORE.txt
@@ -0,0 +1,22 @@
+SHORT L2 cache store bandwidth in MBytes/s
+
+EVENTSET
+PMC0 PM_L2_ST
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+L2 store bandwidth [MBytes/s] 1.0E-06*(PMC0)*128.0/time
+L2 store data volume [GBytes] 1.0E-09*(PMC0)*128.0
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+L2 store bandwidth [MBytes/s] = 1.0E-06*(PM_L2_ST)*128.0/time
+L2 store data volume [GBytes] = 1.0E-09*(PM_L2_ST)*128.0
+-
+Profiling group to measure L2 store cache bandwidth. The bandwidth is computed from
+the number of cache lines stored from the L1 cache to the L2.
diff --git a/collectors/likwid/groups/power9/L3.txt b/collectors/likwid/groups/power9/L3.txt
new file mode 100644
index 0000000..cb97ead
--- /dev/null
+++ b/collectors/likwid/groups/power9/L3.txt
@@ -0,0 +1,26 @@
+SHORT L3 cache bandwidth in MBytes/s
+
+EVENTSET
+PMC0 PM_L3_LD_PREF
+PMC3 PM_DATA_FROM_L3
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+L3D load bandwidth [MBytes/s] 1.0E-06*(PMC3+PMC0)*128.0/time
+L3D load data volume [GBytes] 1.0E-09*(PMC3+PMC0)*128.0
+Loads from local L3 per cycle 100.0*(PMC3+PMC0)/PMC5
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+L3D load bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_FROM_L3+PM_L3_LD_PREF)*128.0/time
+L3D load data volume [GBytes] = 1.0E-09*(PM_DATA_FROM_L3+PM_L3_LD_PREF)*128.0
+Loads from local L3 per cycle = 100.0*(PM_DATA_FROM_L3+PM_L3_LD_PREF)/PM_RUN_CYC
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed from the
+number of cache lines loaded from the L3 to the L2 cache. There is currently no
+event to get the evicted data volume.
diff --git a/collectors/likwid/groups/power9/MEM.txt b/collectors/likwid/groups/power9/MEM.txt
new file mode 100644
index 0000000..022d39d
--- /dev/null
+++ b/collectors/likwid/groups/power9/MEM.txt
@@ -0,0 +1,47 @@
+SHORT Main memory bandwidth in MBytes/s
+
+EVENTSET
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+MBOX0C0 PM_MBA0_READ_BYTES
+MBOX0C1 PM_MBA0_WRITE_BYTES
+MBOX1C0 PM_MBA1_READ_BYTES
+MBOX1C1 PM_MBA1_WRITE_BYTES
+MBOX2C0 PM_MBA2_READ_BYTES
+MBOX2C1 PM_MBA2_WRITE_BYTES
+MBOX3C0 PM_MBA3_READ_BYTES
+MBOX3C1 PM_MBA3_WRITE_BYTES
+MBOX4C0 PM_MBA4_READ_BYTES
+MBOX4C1 PM_MBA4_WRITE_BYTES
+MBOX5C0 PM_MBA5_READ_BYTES
+MBOX5C1 PM_MBA5_WRITE_BYTES
+MBOX6C0 PM_MBA6_READ_BYTES
+MBOX6C1 PM_MBA6_WRITE_BYTES
+MBOX7C0 PM_MBA7_READ_BYTES
+MBOX7C1 PM_MBA7_WRITE_BYTES
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(PM_MBAx_READ_BYTES))*64.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*(SUM(PM_MBAx_READ_BYTES))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(PM_MBAx_WRITE_BYTES))*64.0/runtime
+Memory write data volume [GBytes] = 1.0E-09*(SUM(PM_MBAx_WRITE_BYTES))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(PM_MBAx_READ_BYTES)+SUM(PM_MBAx_WRITE_BYTES))*64.0/runtime
+Memory data volume [GBytes] = 1.0E-09*(SUM(PM_MBAx_READ_BYTES)+SUM(PM_MBAx_WRITE_BYTES))*64.0
+-
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events it is only possible to measure on a
+per-socket basis. Some of the counters may not be available on your system.
+Also outputs the total data volume transferred from main memory.
diff --git a/collectors/likwid/groups/power9/TLB_DATA.txt b/collectors/likwid/groups/power9/TLB_DATA.txt
new file mode 100644
index 0000000..3d77654
--- /dev/null
+++ b/collectors/likwid/groups/power9/TLB_DATA.txt
@@ -0,0 +1,42 @@
+SHORT L1 Data TLB miss rate/ratio
+
+EVENTSET
+PMC0 PM_LSU_DTLB_MISS_16G_1G
+PMC1 PM_LSU_DTLB_MISS_4K
+PMC2 PM_LSU_DTLB_MISS_64K
+PMC3 PM_LSU_DTLB_MISS_16M_2M
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+L1 DTLB 4K misses PMC1
+L1 DTLB 4K miss rate PMC1/PMC4
+L1 DTLB 4K miss ratio [%] (PMC1/(PMC0+PMC1+PMC2+PMC3))*100.0
+L1 DTLB 64K misses PMC2
+L1 DTLB 64K miss rate PMC2/PMC4
+L1 DTLB 64K miss ratio [%] (PMC2/(PMC0+PMC1+PMC2+PMC3))*100.0
+L1 DTLB 16M/2M misses PMC3
+L1 DTLB 16M/2M miss rate PMC3/PMC4
+L1 DTLB 16M/2M miss ratio [%] (PMC3/(PMC0+PMC1+PMC2+PMC3))*100.0
+L1 DTLB 16G/1G misses PMC0
+L1 DTLB 16G/1G miss rate PMC0/PMC4
+L1 DTLB 16G/1G miss ratio [%] (PMC0/(PMC0+PMC1+PMC2+PMC3))*100.0
+
+LONG
+Formulas:
+L1 DTLB 4K misses = PM_LSU_DTLB_MISS_4K
+L1 DTLB 4K miss rate = PM_LSU_DTLB_MISS_4K/PM_RUN_INST_CMPL
+L1 DTLB 4K miss ratio [%] = (PM_LSU_DTLB_MISS_4K/(PM_LSU_DTLB_MISS_4K+PM_LSU_DTLB_MISS_64K+PM_LSU_DTLB_MISS_16M_2M+PM_LSU_DTLB_MISS_16G_1G))*100
+L1 DTLB 64K misses = PM_LSU_DTLB_MISS_64K
+L1 DTLB 64K miss rate = PM_LSU_DTLB_MISS_64K/PM_RUN_INST_CMPL
+L1 DTLB 64K miss ratio [%] = (PM_LSU_DTLB_MISS_64K/(PM_LSU_DTLB_MISS_4K+PM_LSU_DTLB_MISS_64K+PM_LSU_DTLB_MISS_16M_2M+PM_LSU_DTLB_MISS_16G_1G))*100
+L1 DTLB 16M/2M misses = PM_LSU_DTLB_MISS_16M_2M
+L1 DTLB 16M/2M miss rate = PM_LSU_DTLB_MISS_16M_2M/PM_RUN_INST_CMPL
+L1 DTLB 16M/2M miss ratio [%] = (PM_LSU_DTLB_MISS_16M_2M/(PM_LSU_DTLB_MISS_4K+PM_LSU_DTLB_MISS_64K+PM_LSU_DTLB_MISS_16M_2M+PM_LSU_DTLB_MISS_16G_1G))*100
+L1 DTLB 16G/1G misses = PM_LSU_DTLB_MISS_16G_1G
+L1 DTLB 16G/1G miss rate = PM_LSU_DTLB_MISS_16G_1G/PM_RUN_INST_CMPL
+L1 DTLB 16G/1G miss ratio [%] = (PM_LSU_DTLB_MISS_16G_1G/(PM_LSU_DTLB_MISS_4K+PM_LSU_DTLB_MISS_64K+PM_LSU_DTLB_MISS_16M_2M+PM_LSU_DTLB_MISS_16G_1G))*100
+-
+This group measures the data TLB misses for different page sizes.
diff --git a/collectors/likwid/groups/power9/TLB_INSTR.txt b/collectors/likwid/groups/power9/TLB_INSTR.txt
new file mode 100644
index 0000000..dc99d8a
--- /dev/null
+++ b/collectors/likwid/groups/power9/TLB_INSTR.txt
@@ -0,0 +1,21 @@
+SHORT L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+PMC3 PM_ITLB_MISS
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+L1 ITLB misses PMC3
+L1 ITLB miss rate PMC3/PMC4
+
+LONG
+Formulas:
+L1 ITLB misses = PM_ITLB_MISS
+L1 ITLB miss rate = PM_ITLB_MISS/PM_RUN_INST_CMPL
+-
+This group measures the reloads of the instruction TLB.
+Misses to the HPT are counted once while misses in the Radix
+tree count the number of tree levels traversed.
diff --git a/collectors/likwid/groups/power9/USEFUL.txt b/collectors/likwid/groups/power9/USEFUL.txt
new file mode 100644
index 0000000..bbc20a0
--- /dev/null
+++ b/collectors/likwid/groups/power9/USEFUL.txt
@@ -0,0 +1,22 @@
+SHORT Rate of useful instructions
+
+EVENTSET
+PMC0 PM_RUN_SPURR
+PMC1 PM_INST_DISP
+PMC3 PM_RUN_PURR
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+CPI PMC5/PMC4
+Useful instr. rate [%] (PMC4/PMC1)*100.0
+Processor Utilization [%] (PMC0/PMC3)*100.0
+
+
+LONG
+Formulas:
+Useful instr. rate [%] = (PM_RUN_INST_CMPL/PM_INST_DISP)*100
+Processor Utilization [%] = (PM_RUN_SPURR/PM_RUN_PURR)*100
+--
+This performance group shows the overhead of speculative
+execution of instructions and the processor utilization.
diff --git a/collectors/likwid/groups/sandybridge/BRANCH.txt b/collectors/likwid/groups/sandybridge/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 BR_INST_RETIRED_ALL_BRANCHES
+PMC1 BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Branch rate PMC0/FIXC0
+Branch misprediction rate PMC1/FIXC0
+Branch misprediction ratio PMC1/PMC0
+Instructions per branch FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio expresses
+directly which share of all branch instructions were mispredicted.
+Instructions per branch is 1/branch rate.
+
diff --git a/collectors/likwid/groups/sandybridge/CLOCK.txt b/collectors/likwid/groups/sandybridge/CLOCK.txt
new file mode 100644
index 0000000..a888d66
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/CLOCK.txt
@@ -0,0 +1,30 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+UBOXFIX UNCORE_CLOCK
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+Uncore Clock [MHz] 1.E-06*UBOXFIX/time
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formulas:
+Power = PWR_PKG_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time
+-
+SandyBridge implements the new RAPL interface. This interface makes it possible
+to monitor the consumed energy on the package (socket) and DRAM level.
+
diff --git a/collectors/likwid/groups/sandybridge/CYCLE_ACTIVITY.txt b/collectors/likwid/groups/sandybridge/CYCLE_ACTIVITY.txt
new file mode 100644
index 0000000..8dbfe25
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/CYCLE_ACTIVITY.txt
@@ -0,0 +1,33 @@
+SHORT Cycle Activities
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING
+PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING
+PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Cycles without execution [%] (PMC3/FIXC1)*100
+Cycles without execution due to L1D [%] (PMC2/FIXC1)*100
+Cycles without execution due to L2 [%] (PMC0/FIXC1)*100
+
+LONG
+Formulas:
+Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100
+Cycles without execution due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100
+Cycles without execution due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100
+--
+This performance group measures the cycles spent waiting for data from the cache
+and memory hierarchy.
+CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts the number of cycles nothing is executed
+on any execution port.
+CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is
+outstanding.
+CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is
+outstanding.
diff --git a/collectors/likwid/groups/sandybridge/CYCLE_STALLS.txt b/collectors/likwid/groups/sandybridge/CYCLE_STALLS.txt
new file mode 100644
index 0000000..d66cbb1
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/CYCLE_STALLS.txt
@@ -0,0 +1,38 @@
+SHORT Cycle Activities (Stalls)
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING
+PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING
+PMC3 CYCLE_ACTIVITY_STALLS_TOTAL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Total execution stalls PMC3
+Stalls caused by L1D misses [%] (PMC2/PMC3)*100
+Stalls caused by L2 misses [%] (PMC0/PMC3)*100
+Execution stall rate [%] (PMC3/FIXC1)*100
+Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100
+Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100
+
+LONG
+Formulas:
+Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL
+Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
+Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
+Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100
+Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100
+Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100
+--
+This performance group measures the stalls caused by data traffic in the cache
+hierarchy.
+CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls.
+CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand
+load is outstanding.
+CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand
+load is outstanding.
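A quick sketch of the CYCLE_STALLS percentages, using assumed counter values to show how the stall budget decomposes:

    # Hypothetical example: sandybridge CYCLE_STALLS percentages.
    stalls_total = 3.0e9  # assumed CYCLE_ACTIVITY_STALLS_TOTAL
    stalls_l1d = 9.0e8    # assumed CYCLE_ACTIVITY_STALLS_L1D_PENDING
    stalls_l2 = 6.0e8     # assumed CYCLE_ACTIVITY_STALLS_L2_PENDING
    clk_core = 8.0e9      # assumed CPU_CLK_UNHALTED_CORE

    print((stalls_l1d / stalls_total) * 100)  # share of stalls waiting on L1D misses
    print((stalls_l2 / stalls_total) * 100)   # share of stalls waiting on L2 misses
    print((stalls_total / clk_core) * 100)    # execution stall rate over all cycles
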
diff --git a/collectors/likwid/groups/sandybridge/DATA.txt b/collectors/likwid/groups/sandybridge/DATA.txt
new file mode 100644
index 0000000..967cbad
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_UOPS_RETIRED_LOADS
+PMC1 MEM_UOPS_RETIRED_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/collectors/likwid/groups/sandybridge/DIVIDE.txt b/collectors/likwid/groups/sandybridge/DIVIDE.txt
new file mode 100644
index 0000000..504181c
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/DIVIDE.txt
@@ -0,0 +1,24 @@
+SHORT Divide unit information
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ARITH_NUM_DIV
+PMC1 ARITH_FPU_DIV_ACTIVE
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Number of divide ops PMC0
+Avg. divide unit usage duration PMC1/PMC0
+
+LONG
+Formulas:
+Number of divide ops = ARITH_NUM_DIV
+Avg. divide unit usage duration = ARITH_FPU_DIV_ACTIVE/ARITH_NUM_DIV
+-
+This performance group measures the average latency of divide operations.
diff --git a/collectors/likwid/groups/sandybridge/ENERGY.txt b/collectors/likwid/groups/sandybridge/ENERGY.txt
new file mode 100644
index 0000000..9898c70
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/ENERGY.txt
@@ -0,0 +1,37 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+TMP0 TEMP_CORE
+PWR0 PWR_PKG_ENERGY
+PWR1 PWR_PP0_ENERGY
+PWR2 PWR_PP1_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Temperature [C] TMP0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy PP0 [J] PWR1
+Power PP0 [W] PWR1/time
+Energy PP1 [J] PWR2
+Power PP1 [W] PWR2/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formulas:
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power PP1 = PWR_PP1_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+SandyBridge implements the new RAPL interface. This interface makes it possible
+to monitor the consumed energy on the package (socket) level.
+ diff --git a/collectors/likwid/groups/sandybridge/FALSE_SHARE.txt b/collectors/likwid/groups/sandybridge/FALSE_SHARE.txt new file mode 100644 index 0000000..fbec3f4 --- /dev/null +++ b/collectors/likwid/groups/sandybridge/FALSE_SHARE.txt @@ -0,0 +1,25 @@ +SHORT False sharing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM +PMC2 MEM_LOAD_UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local LLC false sharing [MByte] 1.E-06*PMC0*64 +Local LLC false sharing rate PMC0/PMC2 + +LONG +Formulas: +Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM*64 +Local LLC false sharing rate = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL +- +False-sharing of cache lines can dramatically reduce the performance of an +application. This performance group measures the L3 traffic induced by false-sharing. +The false-sharing rate uses all memory loads as reference. diff --git a/collectors/likwid/groups/sandybridge/FLOPS_AVX.txt b/collectors/likwid/groups/sandybridge/FLOPS_AVX.txt new file mode 100644 index 0000000..5a3f14f --- /dev/null +++ b/collectors/likwid/groups/sandybridge/FLOPS_AVX.txt @@ -0,0 +1,26 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 SIMD_FP_256_PACKED_SINGLE +PMC1 SIMD_FP_256_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime +- +Packed 32b AVX FLOPs rates. +Please note that the current FLOP measurements on SandyBridge are +potentially wrong. So you cannot trust these counters at the moment! 
+ diff --git a/collectors/likwid/groups/sandybridge/FLOPS_DP.txt b/collectors/likwid/groups/sandybridge/FLOPS_DP.txt new file mode 100644 index 0000000..91f8a86 --- /dev/null +++ b/collectors/likwid/groups/sandybridge/FLOPS_DP.txt @@ -0,0 +1,33 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE +PMC2 SIMD_FP_256_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE) +- +SSE scalar and packed double precision FLOP rates. +Please note that the current FLOP measurements on SandyBridge are potentially +wrong. So you cannot trust these counters at the moment! + diff --git a/collectors/likwid/groups/sandybridge/FLOPS_SP.txt b/collectors/likwid/groups/sandybridge/FLOPS_SP.txt new file mode 100644 index 0000000..930a988 --- /dev/null +++ b/collectors/likwid/groups/sandybridge/FLOPS_SP.txt @@ -0,0 +1,33 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE +PMC2 SIMD_FP_256_PACKED_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime +Vectorization ratio = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE) +- +SSE scalar and packed single precision FLOP rates. +Please note that the current FLOP measurements on SandyBridge are potentially +wrong. So you cannot trust these counters at the moment! 
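The FLOPS_DP/FLOPS_SP rates weight each event by the number of floating-point operations it represents: 2 per packed SSE DP uop, 1 per scalar uop, 4 per 256-bit AVX DP uop (4/1/8 respectively for single precision). A minimal C sketch of the DP formulas with hypothetical counts (illustration only):

#include <stdio.h>

int main(void) {
    /* Hypothetical raw event counts for a 1.5 s measurement. */
    double sse_packed_dp = 1.0e9; /* PMC0: FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE */
    double sse_scalar_dp = 2.0e8; /* PMC1: FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE */
    double avx_packed_dp = 5.0e8; /* PMC2: SIMD_FP_256_PACKED_DOUBLE */
    double time = 1.5;

    /* DP [MFLOP/s] = 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time */
    double mflops = 1.0e-6 * (sse_packed_dp * 2.0 + sse_scalar_dp
                              + avx_packed_dp * 4.0) / time;
    /* Vectorization ratio = 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) */
    double vec_ratio = 100.0 * (sse_packed_dp + avx_packed_dp)
                     / (sse_packed_dp + sse_scalar_dp + avx_packed_dp);

    printf("DP [MFLOP/s]        %.2f\n", mflops);
    printf("Vectorization ratio %.2f\n", vec_ratio);
    return 0;
}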
+ diff --git a/collectors/likwid/groups/sandybridge/ICACHE.txt b/collectors/likwid/groups/sandybridge/ICACHE.txt new file mode 100644 index 0000000..f1e2335 --- /dev/null +++ b/collectors/likwid/groups/sandybridge/ICACHE.txt @@ -0,0 +1,33 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ICACHE_ACCESSES +PMC1 ICACHE_MISSES +PMC2 ICACHE_IFETCH_STALL +PMC3 ILD_STALL_IQ_FULL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 +L1I stalls PMC2 +L1I stall rate PMC2/FIXC0 +L1I queue full stalls PMC3 +L1I queue full stall rate PMC3/FIXC0 + +LONG +Formulas: +L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY +L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES +L1I stalls = ICACHE_IFETCH_STALL +L1I stall rate = ICACHE_IFETCH_STALL / INSTR_RETIRED_ANY +- +This group measures some L1 instruction cache metrics. diff --git a/collectors/likwid/groups/sandybridge/L2.txt b/collectors/likwid/groups/sandybridge/L2.txt new file mode 100644 index 0000000..1feb44c --- /dev/null +++ b/collectors/likwid/groups/sandybridge/L2.txt @@ -0,0 +1,38 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L1D_M_EVICT +PMC2 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the +number of cache lines allocated in the L1 and the number of modified cache lines +evicted from the L1. The group also outputs the total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and traffic caused by misses in the +L1 instruction cache.
+ diff --git a/collectors/likwid/groups/sandybridge/L2CACHE.txt b/collectors/likwid/groups/sandybridge/L2CACHE.txt new file mode 100644 index 0000000..fbc3745 --- /dev/null +++ b/collectors/likwid/groups/sandybridge/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_TRANS_ALL_REQUESTS +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS +- +This group measures the locality of your data accesses with regard to the +L2 cache. The L2 request rate tells you how data intensive your code is, +i.e. how many data accesses you perform on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cache lines from memory. Finally, the L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be dictated by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/sandybridge/L3.txt b/collectors/likwid/groups/sandybridge/L3.txt new file mode 100644 index 0000000..f63a918 --- /dev/null +++ b/collectors/likwid/groups/sandybridge/L3.txt @@ -0,0 +1,36 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ALL +PMC1 L2_TRANS_L2_WB + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed from the +number of cache lines allocated in the L2 and the number of modified cache lines +evicted from the L2. This group also outputs the data volume transferred between the +L3 and the measured cores' L2 caches. Note that this bandwidth also includes data +transfers due to a write allocate load on a store miss in L2.
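All bandwidth and volume metrics in the L2/L3 groups follow one pattern: a cache-line-granular event count times the 64-byte line size, scaled by 1.0E-06/time for MBytes/s or by 1.0E-09 for GBytes. A reduced C sketch for the L2D metrics with hypothetical counts (ICACHE_MISSES omitted for brevity; illustration only):

#include <stdio.h>

#define CACHELINE 64.0 /* bytes per cache line on SandyBridge */

int main(void) {
    double l1d_replacement = 3.0e8; /* PMC0: lines loaded into L1 */
    double l1d_m_evict     = 1.0e8; /* PMC1: modified lines evicted from L1 */
    double time = 1.0;              /* measurement duration [s] */

    printf("L2D load bandwidth  [MBytes/s] %.2f\n",
           1.0e-6 * l1d_replacement * CACHELINE / time);
    printf("L2D evict bandwidth [MBytes/s] %.2f\n",
           1.0e-6 * l1d_m_evict * CACHELINE / time);
    printf("L2D data volume     [GBytes]   %.3f\n",
           1.0e-9 * (l1d_replacement + l1d_m_evict) * CACHELINE);
    return 0;
}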
+ diff --git a/collectors/likwid/groups/sandybridge/L3CACHE.txt b/collectors/likwid/groups/sandybridge/L3CACHE.txt new file mode 100644 index 0000000..3dbb6cc --- /dev/null +++ b/collectors/likwid/groups/sandybridge/L3CACHE.txt @@ -0,0 +1,36 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0:MATCH0=0x0081:MATCH1=0x3fffc0 OFFCORE_RESPONSE_0_OPTIONS +PMC1:MATCH0=0x0081:MATCH1=0x1 OFFCORE_RESPONSE_1_OPTIONS +PMC2 L1D_REPLACEMENT + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate PMC1/FIXC0 +L3 miss rate PMC0/FIXC0 +L3 miss ratio PMC0/PMC1 + +LONG +Formulas: +L3 request rate = OFFCORE_RESPONSE_1_OPTIONS:MATCH0=0x0081:MATCH1=0x1/INSTR_RETIRED_ANY +L3 miss rate = OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x0081:MATCH1=0x3fffc0/INSTR_RETIRED_ANY +L3 miss ratio = OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x0081:MATCH1=0x3fffc0/OFFCORE_RESPONSE_1_OPTIONS:MATCH0=0x0081:MATCH1=0x1 +- +This group measures the locality of your data accesses with regard to the +L3 cache. L3 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L3 miss rate gives a measure how often it was necessary to get +cache lines from L3 compared to all loaded cache lines in L1. +And finally L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/sandybridge/PORT_USAGE.txt b/collectors/likwid/groups/sandybridge/PORT_USAGE.txt new file mode 100644 index 0000000..d509607 --- /dev/null +++ b/collectors/likwid/groups/sandybridge/PORT_USAGE.txt @@ -0,0 +1,40 @@ +SHORT Execution port utilization + +REQUIRE_NOHT + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_DISPATCHED_PORT_PORT_0 +PMC1 UOPS_DISPATCHED_PORT_PORT_1 +PMC2 UOPS_DISPATCHED_PORT_PORT_2 +PMC3 UOPS_DISPATCHED_PORT_PORT_3 +PMC4 UOPS_DISPATCHED_PORT_PORT_4 +PMC5 UOPS_DISPATCHED_PORT_PORT_5 + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Port0 usage ratio PMC0/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port1 usage ratio PMC1/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port2 usage ratio PMC2/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port3 usage ratio PMC3/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port4 usage ratio PMC4/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port5 usage ratio PMC5/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) + +LONG +Formulas: +Port0 usage ratio = UOPS_DISPATCHED_PORT_PORT_0/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port1 usage ratio = UOPS_DISPATCHED_PORT_PORT_1/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port2 usage ratio = UOPS_DISPATCHED_PORT_PORT_2/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port3 usage ratio = UOPS_DISPATCHED_PORT_PORT_3/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port4 usage ratio = UOPS_DISPATCHED_PORT_PORT_4/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port5 usage ratio = UOPS_DISPATCHED_PORT_PORT_5/SUM(UOPS_DISPATCHED_PORT_PORT_*) +- +This group measures the execution port utilization in a CPU core. The group can +only be measured when HyperThreading is disabled because only then each CPU core +can program eight counters. 
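The PORT_USAGE ratios normalize the uops dispatched to each port by the sum over all six ports. A small C sketch with hypothetical counts (illustration only):

#include <stdio.h>

int main(void) {
    /* Hypothetical UOPS_DISPATCHED_PORT_PORT_0..5 counts (PMC0..PMC5). */
    double port[6] = { 4.0e8, 3.5e8, 2.0e8, 2.0e8, 1.5e8, 3.0e8 };
    double sum = 0.0;

    for (int i = 0; i < 6; i++)
        sum += port[i];
    for (int i = 0; i < 6; i++)
        printf("Port%d usage ratio %.3f\n", i, port[i] / sum);
    return 0;
}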
diff --git a/collectors/likwid/groups/sandybridge/RECOVERY.txt b/collectors/likwid/groups/sandybridge/RECOVERY.txt new file mode 100644 index 0000000..7928ee4 --- /dev/null +++ b/collectors/likwid/groups/sandybridge/RECOVERY.txt @@ -0,0 +1,22 @@ +SHORT Recovery duration + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INT_MISC_RECOVERY_CYCLES +PMC1 INT_MISC_RECOVERY_COUNT + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Avg. recovery duration PMC0/PMC1 + +LONG +Formulas: +Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT +- +This group measures the duration of recoveries after SSE exceptions, memory +disambiguation, etc. diff --git a/collectors/likwid/groups/sandybridge/TLB_DATA.txt b/collectors/likwid/groups/sandybridge/TLB_DATA.txt new file mode 100644 index 0000000..8d94e05 --- /dev/null +++ b/collectors/likwid/groups/sandybridge/TLB_DATA.txt @@ -0,0 +1,35 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK +PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK +PMC2 DTLB_LOAD_MISSES_WALK_DURATION +PMC3 DTLB_STORE_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB load misses PMC0 +L1 DTLB load miss rate PMC0/FIXC0 +L1 DTLB load miss duration [Cyc] PMC2/PMC0 +L1 DTLB store misses PMC1 +L1 DTLB store miss rate PMC1/FIXC0 +L1 DTLB store miss duration [Cyc] PMC3/PMC1 + +LONG +Formulas: +L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK +L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK +- +The DTLB load and store miss rates give a measure of how often a TLB miss occurred +per instruction. The duration measures how many cycles a page table walk took on average. + diff --git a/collectors/likwid/groups/sandybridge/TLB_INSTR.txt b/collectors/likwid/groups/sandybridge/TLB_INSTR.txt new file mode 100644 index 0000000..235d977 --- /dev/null +++ b/collectors/likwid/groups/sandybridge/TLB_INSTR.txt @@ -0,0 +1,28 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ITLB_MISSES_CAUSES_A_WALK +PMC1 ITLB_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + + +LONG +Formulas: +L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK +L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK +- +The ITLB miss rate gives a measure of how often a TLB miss occurred +per instruction. The duration measures how many cycles a page table walk took on average.
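The TLB miss duration metrics divide total walk cycles by the number of walks; a robust evaluation should guard against a zero walk count. A hypothetical C sketch (illustration only):

#include <stdio.h>

/* Average page-walk duration; guard against a zero walk count
 * to avoid division by zero in phases without any TLB miss. */
static double walk_duration(double walk_cycles, double walks) {
    return (walks > 0.0) ? walk_cycles / walks : 0.0;
}

int main(void) {
    double dtlb_load_walks  = 1.2e6; /* PMC0: DTLB_LOAD_MISSES_CAUSES_A_WALK */
    double dtlb_load_cycles = 3.6e7; /* PMC2: DTLB_LOAD_MISSES_WALK_DURATION */

    printf("L1 DTLB load miss duration [Cyc] %.1f\n",
           walk_duration(dtlb_load_cycles, dtlb_load_walks));
    return 0;
}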
+ diff --git a/collectors/likwid/groups/sandybridge/TMA.txt b/collectors/likwid/groups/sandybridge/TMA.txt new file mode 100644 index 0000000..afb4126 --- /dev/null +++ b/collectors/likwid/groups/sandybridge/TMA.txt @@ -0,0 +1,48 @@ +SHORT Top down cycle allocation + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_RETIRED_RETIRE_SLOTS +PMC2 IDQ_UOPS_NOT_DELIVERED_CORE +PMC3 INT_MISC_RECOVERY_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +IPC FIXC0/FIXC1 +Total Slots 4*FIXC1 +Slots Retired PMC1 +Fetch Bubbles PMC2 +Recovery Bubbles 4*PMC3 +Front End [%] PMC2/(4*FIXC1)*100 +Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100 +Retiring [%] PMC1/(4*FIXC1)*100 +Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100 + +LONG +Formulas: +Total Slots = 4*CPU_CLK_UNHALTED_CORE +Slots Retired = UOPS_RETIRED_RETIRE_SLOTS +Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE +Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES +Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100 +Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100 +Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100 +Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100 +-- +This performance group measures cycles to determine percentage of time spent in +front end, back end, retiring and speculation. These metrics are published and +verified by Intel. Further information: +Webpage describing Top-Down Method and its usage in Intel vTune: +https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method +Paper by Yasin Ahmad: +https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0 +Slides by Yasin Ahmad: +http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf +The performance group was originally published here: +http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/ diff --git a/collectors/likwid/groups/sandybridge/UOPS.txt b/collectors/likwid/groups/sandybridge/UOPS.txt new file mode 100644 index 0000000..a4d35d8 --- /dev/null +++ b/collectors/likwid/groups/sandybridge/UOPS.txt @@ -0,0 +1,32 @@ +SHORT UOPs execution info + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_EXECUTED_THREAD +PMC2 UOPS_RETIRED_ALL + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Issued UOPs PMC0 +Executed UOPs PMC1 +Retired UOPs PMC2 + +LONG +Formulas: +Issued UOPs = UOPS_ISSUED_ANY +Executed UOPs = UOPS_EXECUTED_THREAD +Retired UOPs = UOPS_RETIRED_ALL +- +This group returns information about the instruction pipeline. It measures the +issued, executed and retired uOPs and returns the number of uOPs which were issued +but not executed as well as the number of uOPs which were executed but never retired. +The executed but not retired uOPs commonly come from speculatively executed branches. 
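The TMA group above distributes the 4 issue slots per cycle among the four top-level categories; with the group's formulas, Front End + Speculation + Retiring + Back End adds up to 100% by construction. A C sketch of that slot accounting with hypothetical counts (illustration only):

#include <stdio.h>

int main(void) {
    /* Hypothetical raw counts for the four TMA events. */
    double uops_issued   = 3.0e9; /* PMC0: UOPS_ISSUED_ANY */
    double slots_retired = 2.6e9; /* PMC1: UOPS_RETIRED_RETIRE_SLOTS */
    double fetch_bubbles = 4.0e8; /* PMC2: IDQ_UOPS_NOT_DELIVERED_CORE */
    double recovery_cyc  = 2.0e7; /* PMC3: INT_MISC_RECOVERY_CYCLES */
    double core_cycles   = 1.0e9; /* FIXC1: CPU_CLK_UNHALTED_CORE */

    double slots = 4.0 * core_cycles; /* 4 issue slots per cycle */
    double front_end   = fetch_bubbles / slots * 100.0;
    double speculation = (uops_issued - slots_retired
                          + 4.0 * recovery_cyc) / slots * 100.0;
    double retiring    = slots_retired / slots * 100.0;
    /* Equivalent to the group's Back End formula. */
    double back_end    = 100.0 - front_end - speculation - retiring;

    printf("Front End   [%%] %.1f\n", front_end);
    printf("Speculation [%%] %.1f\n", speculation);
    printf("Retiring    [%%] %.1f\n", retiring);
    printf("Back End    [%%] %.1f\n", back_end);
    return 0;
}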
+ diff --git a/collectors/likwid/groups/sandybridge/UOPS_EXEC.txt b/collectors/likwid/groups/sandybridge/UOPS_EXEC.txt new file mode 100644 index 0000000..7042df7 --- /dev/null +++ b/collectors/likwid/groups/sandybridge/UOPS_EXEC.txt @@ -0,0 +1,31 @@ +SHORT UOPs execution + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_USED_CYCLES +PMC1 UOPS_EXECUTED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the execution stage in the pipeline. Used cycles are all cycles where uOPs are +executed, while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles. diff --git a/collectors/likwid/groups/sandybridge/UOPS_ISSUE.txt b/collectors/likwid/groups/sandybridge/UOPS_ISSUE.txt new file mode 100644 index 0000000..9aac923 --- /dev/null +++ b/collectors/likwid/groups/sandybridge/UOPS_ISSUE.txt @@ -0,0 +1,31 @@ +SHORT UOPs issuing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_USED_CYCLES +PMC1 UOPS_ISSUED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the issue stage in the pipeline. Used cycles are all cycles where uOPs are +issued, while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles.
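The :EDGEDETECT modifier makes a counter increment only on the 0-to-1 transition of its event condition, so it counts stall periods rather than stall cycles; the average stall duration is then the plain cycle count divided by the edge count. The following C sketch emulates that logic on an invented per-cycle stall trace (illustration of the arithmetic only, not how the hardware counter is programmed):

#include <stdio.h>

int main(void) {
    /* Hypothetical per-cycle stall indicator (1 = stalled in that cycle). */
    int stalled[] = { 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0 };
    int n = (int)(sizeof(stalled) / sizeof(stalled[0]));

    long stall_cycles = 0, edges = 0;
    for (int i = 0; i < n; i++) {
        stall_cycles += stalled[i];
        if (stalled[i] && (i == 0 || !stalled[i - 1]))
            edges++; /* rising edge: a new stall period begins */
    }
    printf("Stall cycles %ld, stall periods %ld, avg duration %.2f cycles\n",
           stall_cycles, edges,
           edges ? (double)stall_cycles / (double)edges : 0.0);
    return 0;
}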
diff --git a/collectors/likwid/groups/sandybridge/UOPS_RETIRE.txt b/collectors/likwid/groups/sandybridge/UOPS_RETIRE.txt new file mode 100644 index 0000000..0f37585 --- /dev/null +++ b/collectors/likwid/groups/sandybridge/UOPS_RETIRE.txt @@ -0,0 +1,31 @@ +SHORT UOPs retirement + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_RETIRED_USED_CYCLES +PMC1 UOPS_RETIRED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the retirement stage in the pipeline (re-order buffer). Used cycles are all +cycles where uOPs are retired, while unused cycles refer to pipeline stalls. +Moreover, the group calculates the average stall duration in cycles. diff --git a/collectors/likwid/groups/sandybridgeEP/BRANCH.txt b/collectors/likwid/groups/sandybridgeEP/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often, on average, a branch or a mispredicted branch occurred +per retired instruction. The branch misprediction ratio directly relates the number +of mispredicted branches to the total number of branch instructions. +Instructions per branch is 1/branch rate.
+ diff --git a/collectors/likwid/groups/sandybridgeEP/CACHES.txt b/collectors/likwid/groups/sandybridgeEP/CACHES.txt new file mode 100644 index 0000000..345b8f4 --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/CACHES.txt @@ -0,0 +1,97 @@ +SHORT Some data from the CBOXes + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L1D_M_EVICT +PMC2 L2_LINES_IN_ALL +PMC3 L2_TRANS_L2_WB +CBOX0C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX1C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX2C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX3C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX4C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX5C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX6C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX7C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX0C1 LLC_VICTIMS_M_STATE +CBOX1C1 LLC_VICTIMS_M_STATE +CBOX2C1 LLC_VICTIMS_M_STATE +CBOX3C1 LLC_VICTIMS_M_STATE +CBOX4C1 LLC_VICTIMS_M_STATE +CBOX5C1 LLC_VICTIMS_M_STATE +CBOX6C1 LLC_VICTIMS_M_STATE +CBOX7C1 LLC_VICTIMS_M_STATE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 +L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time +L3 to L2 load data volume [GBytes] 1.0E-09*PMC2*64.0 +L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time +L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 +System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F)*64.0/time +System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F)*64.0 +L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)*64.0/time +L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)*64.0 +L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)*64.0/time +L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)*64.0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time +Memory write data volume [GBytes] 
1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0 + + +LONG +Formulas: +L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time +L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64 +L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time +L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64 +L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time +L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64 +L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time +L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64 +L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time +L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64 +L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F))*64/time +System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F))*64 +L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M_STATE))*64/time +L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M_STATE))*64 +L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F)+SUM(LLC_VICTIMS_M_STATE))*64/time +L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F)+SUM(LLC_VICTIMS_M_STATE))*64 +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 +- +Group to measure cache transfers between L1 and memory. Please note that the +L3 to/from system metrics contain any traffic to the system (memory, +Intel QPI, etc.), but they do not appear to capture all of it, because the memory read +bandwidth and the L3 to L2 bandwidth are commonly higher than the measured memory to L3 bandwidth. diff --git a/collectors/likwid/groups/sandybridgeEP/CLOCK.txt b/collectors/likwid/groups/sandybridgeEP/CLOCK.txt new file mode 100644 index 0000000..a888d66 --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/CLOCK.txt @@ -0,0 +1,30 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +UBOXFIX UNCORE_CLOCK + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*UBOXFIX/time +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time +- +SandyBridge implements the RAPL interface. This interface enables monitoring of +the consumed energy on the package (socket) and DRAM level.
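The uncore memory metrics in CACHES (and in the MEM groups further below) sum CAS_COUNT_RD/CAS_COUNT_WR over all four memory channels (MBOX0..MBOX3); each CAS command transfers one 64-byte cache line. A minimal C sketch with hypothetical channel sums (illustration only):

#include <stdio.h>

int main(void) {
    /* Hypothetical CAS_COUNT_RD/WR totals summed over MBOX0..MBOX3. */
    double cas_rd = 2.0e9;
    double cas_wr = 8.0e8;
    double time = 1.0; /* measurement duration [s] */

    printf("Memory read bandwidth  [MBytes/s] %.1f\n",
           1.0e-6 * cas_rd * 64.0 / time);
    printf("Memory write bandwidth [MBytes/s] %.1f\n",
           1.0e-6 * cas_wr * 64.0 / time);
    printf("Memory bandwidth       [MBytes/s] %.1f\n",
           1.0e-6 * (cas_rd + cas_wr) * 64.0 / time);
    return 0;
}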
+ diff --git a/collectors/likwid/groups/sandybridgeEP/CYCLE_ACTIVITY.txt b/collectors/likwid/groups/sandybridgeEP/CYCLE_ACTIVITY.txt new file mode 100644 index 0000000..8dbfe25 --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/CYCLE_ACTIVITY.txt @@ -0,0 +1,33 @@ +SHORT Cycle Activities + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING +PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING +PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Cycles without execution [%] (PMC3/FIXC1)*100 +Cycles without execution due to L1D [%] (PMC2/FIXC1)*100 +Cycles without execution due to L2 [%] (PMC0/FIXC1)*100 + +LONG +Formulas: +Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100 +Cycles with stalls due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles with stalls due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100 +-- +This performance group measures the cycles while waiting for data from the cache +and memory hierarchy. +CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts number of cycles nothing is executed on +any execution port. +CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is +outstanding. diff --git a/collectors/likwid/groups/sandybridgeEP/CYCLE_STALLS.txt b/collectors/likwid/groups/sandybridgeEP/CYCLE_STALLS.txt new file mode 100644 index 0000000..d66cbb1 --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/CYCLE_STALLS.txt @@ -0,0 +1,38 @@ +SHORT Cycle Activities (Stalls) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING +PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING +PMC3 CYCLE_ACTIVITY_STALLS_TOTAL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Total execution stalls PMC3 +Stalls caused by L1D misses [%] (PMC2/PMC3)*100 +Stalls caused by L2 misses [%] (PMC0/PMC3)*100 +Execution stall rate [%] (PMC3/FIXC1)*100 +Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100 +Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100 + +LONG +Formulas: +Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL +Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100 +-- +This performance group measures the stalls caused by data traffic in the cache +hierarchy. +CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls. +CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand +load is outstanding. 
diff --git a/collectors/likwid/groups/sandybridgeEP/DATA.txt b/collectors/likwid/groups/sandybridgeEP/DATA.txt new file mode 100644 index 0000000..967cbad --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_UOPS_RETIRED_LOADS +PMC1 MEM_UOPS_RETIRED_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES +- +This is a metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/sandybridgeEP/DIVIDE.txt b/collectors/likwid/groups/sandybridgeEP/DIVIDE.txt new file mode 100644 index 0000000..504181c --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ARITH_NUM_DIV +PMC1 ARITH_FPU_DIV_ACTIVE + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_NUM_DIV +Avg. divide unit usage duration = ARITH_FPU_DIV_ACTIVE/ARITH_NUM_DIV +- +This performance group measures the average latency of divide operations diff --git a/collectors/likwid/groups/sandybridgeEP/ENERGY.txt b/collectors/likwid/groups/sandybridgeEP/ENERGY.txt new file mode 100644 index 0000000..1ab4ef3 --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/ENERGY.txt @@ -0,0 +1,33 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR3 PWR_DRAM_ENERGY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +SandyBridge implements the new RAPL interface. This interface enables to +monitor the consumed energy on the package (socket) level. 
+ diff --git a/collectors/likwid/groups/sandybridgeEP/FALSE_SHARE.txt b/collectors/likwid/groups/sandybridgeEP/FALSE_SHARE.txt new file mode 100644 index 0000000..27f568a --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/FALSE_SHARE.txt @@ -0,0 +1,27 @@ +SHORT False sharing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM +PMC2 MEM_LOAD_UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local LLC false sharing [MByte] 1.E-06*PMC0*64 +Local LLC false sharing rate PMC0/PMC2 + +LONG +Formulas: +Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM*64 +Local LLC false sharing rate = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL +- +False-sharing of cache lines can dramatically reduce the performance of an +application. This performance group measures the L3 traffic induced by false-sharing. +The false-sharing rate uses all memory loads as reference. +Intel SandyBridge EP CPUs do not provide the events to measure the false-sharing +over CPU socket boundaries. diff --git a/collectors/likwid/groups/sandybridgeEP/FLOPS_AVX.txt b/collectors/likwid/groups/sandybridgeEP/FLOPS_AVX.txt new file mode 100644 index 0000000..5a3f14f --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/FLOPS_AVX.txt @@ -0,0 +1,26 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 SIMD_FP_256_PACKED_SINGLE +PMC1 SIMD_FP_256_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime +- +Packed 32b AVX FLOPs rates. +Please note that the current FLOP measurements on SandyBridge are +potentially wrong. So you cannot trust these counters at the moment! 
+ diff --git a/collectors/likwid/groups/sandybridgeEP/FLOPS_DP.txt b/collectors/likwid/groups/sandybridgeEP/FLOPS_DP.txt new file mode 100644 index 0000000..91f8a86 --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/FLOPS_DP.txt @@ -0,0 +1,33 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE +PMC2 SIMD_FP_256_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE) +- +SSE scalar and packed double precision FLOP rates. +Please note that the current FLOP measurements on SandyBridge are potentially +wrong. So you cannot trust these counters at the moment! + diff --git a/collectors/likwid/groups/sandybridgeEP/FLOPS_SP.txt b/collectors/likwid/groups/sandybridgeEP/FLOPS_SP.txt new file mode 100644 index 0000000..930a988 --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/FLOPS_SP.txt @@ -0,0 +1,33 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE +PMC2 SIMD_FP_256_PACKED_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime +Vectorization ratio = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE) +- +SSE scalar and packed single precision FLOP rates. +Please note that the current FLOP measurements on SandyBridge are potentially +wrong. So you cannot trust these counters at the moment! 
+ diff --git a/collectors/likwid/groups/sandybridgeEP/ICACHE.txt b/collectors/likwid/groups/sandybridgeEP/ICACHE.txt new file mode 100644 index 0000000..f1e2335 --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/ICACHE.txt @@ -0,0 +1,33 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ICACHE_ACCESSES +PMC1 ICACHE_MISSES +PMC2 ICACHE_IFETCH_STALL +PMC3 ILD_STALL_IQ_FULL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 +L1I stalls PMC2 +L1I stall rate PMC2/FIXC0 +L1I queue full stalls PMC3 +L1I queue full stall rate PMC3/FIXC0 + +LONG +Formulas: +L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY +L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES +L1I stalls = ICACHE_IFETCH_STALL +L1I stall rate = ICACHE_IFETCH_STALL / INSTR_RETIRED_ANY +- +This group measures some L1 instruction cache metrics. diff --git a/collectors/likwid/groups/sandybridgeEP/L2.txt b/collectors/likwid/groups/sandybridgeEP/L2.txt new file mode 100644 index 0000000..1feb44c --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/L2.txt @@ -0,0 +1,38 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L1D_M_EVICT +PMC2 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the +number of cache line allocated in the L1 and the number of modified cache lines +evicted from the L1. The group also output total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and traffic caused by misses in the +L1 instruction cache. 
+ diff --git a/collectors/likwid/groups/sandybridgeEP/L2CACHE.txt b/collectors/likwid/groups/sandybridgeEP/L2CACHE.txt new file mode 100644 index 0000000..fbc3745 --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_TRANS_ALL_REQUESTS +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure how often it was necessary to get +cache lines from memory. And finally L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/sandybridgeEP/L3.txt b/collectors/likwid/groups/sandybridgeEP/L3.txt new file mode 100644 index 0000000..f63a918 --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/L3.txt @@ -0,0 +1,36 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ALL +PMC1 L2_TRANS_L2_WB + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the +number of cache line allocated in the L2 and the number of modified cache lines +evicted from the L2. This group also output data volume transferred between the +L3 and measured cores L2 caches. Note that this bandwidth also includes data +transfers due to a write allocate load on a store miss in L2. 
+ diff --git a/collectors/likwid/groups/sandybridgeEP/L3CACHE.txt b/collectors/likwid/groups/sandybridgeEP/L3CACHE.txt new file mode 100644 index 0000000..3dbb6cc --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/L3CACHE.txt @@ -0,0 +1,36 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0:MATCH0=0x0081:MATCH1=0x3fffc0 OFFCORE_RESPONSE_0_OPTIONS +PMC1:MATCH0=0x0081:MATCH1=0x1 OFFCORE_RESPONSE_1_OPTIONS +PMC2 L1D_REPLACEMENT + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate PMC1/FIXC0 +L3 miss rate PMC0/FIXC0 +L3 miss ratio PMC0/PMC1 + +LONG +Formulas: +L3 request rate = OFFCORE_RESPONSE_1_OPTIONS:MATCH0=0x0081:MATCH1=0x1/INSTR_RETIRED_ANY +L3 miss rate = OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x0081:MATCH1=0x3fffc0/INSTR_RETIRED_ANY +L3 miss ratio = OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x0081:MATCH1=0x3fffc0/OFFCORE_RESPONSE_1_OPTIONS:MATCH0=0x0081:MATCH1=0x1 +- +This group measures the locality of your data accesses with regard to the +L3 cache. L3 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L3 miss rate gives a measure how often it was necessary to get +cache lines from L3 compared to all loaded cache lines in L1. +And finally L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/sandybridgeEP/MEM.txt b/collectors/likwid/groups/sandybridgeEP/MEM.txt new file mode 100644 index 0000000..0be0645 --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/MEM.txt @@ -0,0 +1,40 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. 
+Since this group is based on Uncore events it is only possible to measure on a +per socket base. Also outputs total data volume transferred from main memory. + diff --git a/collectors/likwid/groups/sandybridgeEP/MEM_DP.txt b/collectors/likwid/groups/sandybridgeEP/MEM_DP.txt new file mode 100644 index 0000000..f2d68ba --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/MEM_DP.txt @@ -0,0 +1,66 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE +PMC2 SIMD_FP_256_PACKED_DOUBLE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +MFLOP/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time +AVX [MFLOP/s] 1.0E-06*(PMC2*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0 +Operational intensity (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime +AVX [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +Operational intensity = (FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) +-- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per socket base. Also outputs total data volume transferred from main memory. +SSE scalar and packed double precision FLOP rates. Also reports on packed AVX +32b instructions. Please note that the current FLOP measurements on SandyBridge +are potentially wrong. So you cannot trust these counters at the moment! 
diff --git a/collectors/likwid/groups/sandybridgeEP/MEM_DP.txt b/collectors/likwid/groups/sandybridgeEP/MEM_DP.txt
new file mode 100644
index 0000000..f2d68ba
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/MEM_DP.txt
@@ -0,0 +1,66 @@
+SHORT Overview of arithmetic and main memory performance
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2 SIMD_FP_256_PACKED_DOUBLE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+MFLOP/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX [MFLOP/s] 1.0E-06*(PMC2*4.0)/time
+Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+Operational intensity (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0)
+
+LONG
+Formulas:
+Power [W] = PWR_PKG_ENERGY/runtime
+Power DRAM [W] = PWR_DRAM_ENERGY/runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+Operational intensity = (FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+SIMD_FP_256_PACKED_DOUBLE*4)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0)
+--
+Profiling group to measure the memory bandwidth drawn by all cores of a
+socket. Since this group is based on Uncore events, measurements are only
+possible on a per-socket basis. It also outputs the total data volume
+transferred from main memory, together with SSE scalar and packed double
+precision FLOP rates and the packed AVX (256-bit) FLOP rate. Please note
+that the current FLOP measurements on SandyBridge are potentially wrong,
+so these counters cannot be trusted at the moment!
+The operational intensity is calculated using the FP counts of the measured
+cores and the memory data volume of the whole socket. The actual operational
+intensity for multiple CPUs can be found in the statistics table in the
+Sum column.
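As a quick check of the MFLOP/s weighting used by MEM_DP (a 128-bit packed SSE uOP counts as 2 double-precision FLOPs, a scalar uOP as 1, a 256-bit AVX uOP as 4) and of the operational intensity (FLOPs divided by memory bytes), here is a small C sketch. All counter values are invented for the example.

    #include <stdio.h>

    int main(void) {
        /* Hypothetical event counts (invented for the example) */
        double packed128 = 5.0e9;  /* FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE */
        double scalar    = 1.0e9;  /* FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE */
        double packed256 = 2.0e9;  /* SIMD_FP_256_PACKED_DOUBLE */
        double cas_total = 3.0e9;  /* SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR) */
        double time = 2.0;         /* measurement time in seconds */
        /* 128-bit packed = 2 DP FLOPs, scalar = 1, 256-bit packed = 4 */
        double flops = packed128 * 2.0 + scalar + packed256 * 4.0;
        printf("MFLOP/s: %.1f\n", 1.0e-6 * flops / time);
        printf("Operational intensity [FLOP/Byte]: %.3f\n",
               flops / (cas_total * 64.0));
        return 0;
    }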
diff --git a/collectors/likwid/groups/sandybridgeEP/MEM_SP.txt b/collectors/likwid/groups/sandybridgeEP/MEM_SP.txt
new file mode 100644
index 0000000..955cdc4
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/MEM_SP.txt
@@ -0,0 +1,66 @@
+SHORT Overview of arithmetic and main memory performance
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2 SIMD_FP_256_PACKED_SINGLE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+MFLOP/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX [MFLOP/s] 1.0E-06*(PMC2*8.0)/time
+Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+Operational intensity (PMC0*4.0+PMC1+PMC2*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0)
+
+LONG
+Formulas:
+Power [W] = PWR_PKG_ENERGY/runtime
+Power DRAM [W] = PWR_DRAM_ENERGY/runtime
+MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+Operational intensity = (FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0)
+--
+Profiling group to measure the memory bandwidth drawn by all cores of a
+socket. Since this group is based on Uncore events, measurements are only
+possible on a per-socket basis. It also outputs the total data volume
+transferred from main memory, together with SSE scalar and packed single
+precision FLOP rates and the packed AVX (256-bit) FLOP rate. Please note
+that the current FLOP measurements on SandyBridge are potentially wrong,
+so these counters cannot be trusted at the moment!
+The operational intensity is calculated using the FP counts of the measured
+cores and the memory data volume of the whole socket. The actual operational
+intensity for multiple CPUs can be found in the statistics table in the
+Sum column.
diff --git a/collectors/likwid/groups/sandybridgeEP/NUMA.txt b/collectors/likwid/groups/sandybridgeEP/NUMA.txt
new file mode 100644
index 0000000..41fbe62
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/NUMA.txt
@@ -0,0 +1,33 @@
+SHORT Local and remote memory accesses
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 OFFCORE_RESPONSE_0_LOCAL_DRAM
+PMC1 OFFCORE_RESPONSE_1_REMOTE_DRAM
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Local DRAM data volume [GByte] 1.E-09*PMC0*64
+Local DRAM bandwidth [MByte/s] 1.E-06*(PMC0*64)/time
+Remote DRAM data volume [GByte] 1.E-09*PMC1*64
+Remote DRAM bandwidth [MByte/s] 1.E-06*(PMC1*64)/time
+Memory data volume [GByte] 1.E-09*(PMC0+PMC1)*64
+Memory bandwidth [MByte/s] 1.E-06*((PMC0+PMC1)*64)/time
+
+LONG
+Formulas:
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+Local DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_0_LOCAL_DRAM*64
+Local DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_0_LOCAL_DRAM*64)/time
+Remote DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_1_REMOTE_DRAM*64
+Remote DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_1_REMOTE_DRAM*64)/time
+Memory data volume [GByte] = 1.E-09*(OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64
+Memory bandwidth [MByte/s] = 1.E-06*((OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64)/time
+--
+This performance group measures the data traffic of CPU cores to local and
+remote memory.
diff --git a/collectors/likwid/groups/sandybridgeEP/PORT_USAGE.txt b/collectors/likwid/groups/sandybridgeEP/PORT_USAGE.txt
new file mode 100644
index 0000000..d509607
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/PORT_USAGE.txt
@@ -0,0 +1,40 @@
+SHORT Execution port utilization
+
+REQUIRE_NOHT
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_DISPATCHED_PORT_PORT_0
+PMC1 UOPS_DISPATCHED_PORT_PORT_1
+PMC2 UOPS_DISPATCHED_PORT_PORT_2
+PMC3 UOPS_DISPATCHED_PORT_PORT_3
+PMC4 UOPS_DISPATCHED_PORT_PORT_4
+PMC5 UOPS_DISPATCHED_PORT_PORT_5
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Port0 usage ratio PMC0/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5)
+Port1 usage ratio PMC1/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5)
+Port2 usage ratio PMC2/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5)
+Port3 usage ratio PMC3/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5)
+Port4 usage ratio PMC4/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5)
+Port5 usage ratio PMC5/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5)
+
+LONG
+Formulas:
+Port0 usage ratio = UOPS_DISPATCHED_PORT_PORT_0/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port1 usage ratio = UOPS_DISPATCHED_PORT_PORT_1/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port2 usage ratio = UOPS_DISPATCHED_PORT_PORT_2/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port3 usage ratio = UOPS_DISPATCHED_PORT_PORT_3/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port4 usage ratio = UOPS_DISPATCHED_PORT_PORT_4/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port5 usage ratio = UOPS_DISPATCHED_PORT_PORT_5/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+-
+This group measures the execution port utilization in a CPU core.
+The group can only be measured when HyperThreading is disabled, because
+only then can each CPU core program eight counters.
diff --git a/collectors/likwid/groups/sandybridgeEP/QPI.txt b/collectors/likwid/groups/sandybridgeEP/QPI.txt
new file mode 100644
index 0000000..320614f
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/QPI.txt
@@ -0,0 +1,35 @@
+SHORT QPI traffic between sockets
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+SBOX0C0 DIRECT2CORE_SUCCESS
+SBOX0C1 RXL_FLITS_G1_DRS_DATA
+SBOX0C2 RXL_FLITS_G2_NCB_DATA
+SBOX1C0 DIRECT2CORE_SUCCESS
+SBOX1C1 RXL_FLITS_G1_DRS_DATA
+SBOX1C2 RXL_FLITS_G2_NCB_DATA
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Received bandwidth from QPI [MBytes/s] 1.0E-06*(SBOX0C1+SBOX0C2+SBOX1C1+SBOX1C2)*8/time
+Received data volume from QPI [GBytes] 1.0E-09*(SBOX0C1+SBOX0C2+SBOX1C1+SBOX1C2)*8
+Bandwidth QPI to LLC [MBytes/s] 1.0E-06*(SBOX0C0+SBOX1C0)*64/time
+Data volume QPI to LLC [GBytes] 1.0E-09*(SBOX0C0+SBOX1C0)*64
+Bandwidth QPI to HA or IIO [MBytes/s] 1.0E-06*(((SBOX0C1+SBOX0C2+SBOX1C1+SBOX1C2)*8)-((SBOX0C0+SBOX1C0)*64))/time
+Data volume QPI to HA or IIO [GBytes] 1.0E-09*(((SBOX0C1+SBOX0C2+SBOX1C1+SBOX1C2)*8)-((SBOX0C0+SBOX1C0)*64))
+
+LONG
+Formulas:
+Received bandwidth from QPI [MBytes/s] = 1.0E-06*(sum(RXL_FLITS_G1_DRS_DATA)+sum(RXL_FLITS_G2_NCB_DATA))*8/time
+Received data volume from QPI [GBytes] = 1.0E-09*(sum(RXL_FLITS_G1_DRS_DATA)+sum(RXL_FLITS_G2_NCB_DATA))*8
+Bandwidth QPI to LLC [MBytes/s] = 1.0E-06*(sum(DIRECT2CORE_SUCCESS))*64/time
+Data volume QPI to LLC [GBytes] = 1.0E-09*(sum(DIRECT2CORE_SUCCESS))*64
+Bandwidth QPI to HA or IIO [MBytes/s] = 1.0E-06*(((sum(RXL_FLITS_G1_DRS_DATA)+sum(RXL_FLITS_G2_NCB_DATA))*8)-((sum(DIRECT2CORE_SUCCESS))*64))/time
+Data volume QPI to HA or IIO [GBytes] = 1.0E-09*(((sum(RXL_FLITS_G1_DRS_DATA)+sum(RXL_FLITS_G2_NCB_DATA))*8)-((sum(DIRECT2CORE_SUCCESS))*64))
+-
+Profiling group to measure the data traffic on the QPI links between the sockets.
diff --git a/collectors/likwid/groups/sandybridgeEP/RECOVERY.txt b/collectors/likwid/groups/sandybridgeEP/RECOVERY.txt
new file mode 100644
index 0000000..7928ee4
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/RECOVERY.txt
@@ -0,0 +1,22 @@
+SHORT Recovery duration
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 INT_MISC_RECOVERY_CYCLES
+PMC1 INT_MISC_RECOVERY_COUNT
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Avg. recovery duration PMC0/PMC1
+
+LONG
+Formulas:
+Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT
+-
+This group measures the duration of recoveries after SSE exceptions,
+memory disambiguation, etc.
diff --git a/collectors/likwid/groups/sandybridgeEP/TLB_DATA.txt b/collectors/likwid/groups/sandybridgeEP/TLB_DATA.txt
new file mode 100644
index 0000000..8d94e05
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2 DTLB_LOAD_MISSES_WALK_DURATION
+PMC3 DTLB_STORE_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 DTLB load misses PMC0
+L1 DTLB load miss rate PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses PMC1
+L1 DTLB store miss rate PMC1/FIXC0
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
+-
+The DTLB load and store miss rates indicate how often a TLB miss occurred
+per instruction. The duration is the average number of cycles a page table
+walk took.
+
diff --git a/collectors/likwid/groups/sandybridgeEP/TLB_INSTR.txt b/collectors/likwid/groups/sandybridgeEP/TLB_INSTR.txt
new file mode 100644
index 0000000..235d977
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ITLB_MISSES_CAUSES_A_WALK
+PMC1 ITLB_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 ITLB misses PMC0
+L1 ITLB miss rate PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
+-
+The ITLB miss rate indicates how often a TLB miss occurred per instruction.
+The duration is the average number of cycles a page table walk took.
+
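The TLB metrics above all have the same shape: walks per instruction give the miss rate, and walk cycles per walk give the average miss duration. A minimal C sketch with invented counter values:

    #include <stdio.h>

    int main(void) {
        /* Hypothetical counts (invented for the example) */
        double walks  = 2.0e6; /* DTLB_LOAD_MISSES_CAUSES_A_WALK */
        double cycles = 6.0e7; /* DTLB_LOAD_MISSES_WALK_DURATION */
        double instr  = 8.0e9; /* INSTR_RETIRED_ANY */
        printf("L1 DTLB load miss rate: %.2e\n", walks / instr);
        printf("L1 DTLB load miss duration [Cyc]: %.1f\n", cycles / walks);
        return 0;
    }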
diff --git a/collectors/likwid/groups/sandybridgeEP/TMA.txt b/collectors/likwid/groups/sandybridgeEP/TMA.txt
new file mode 100644
index 0000000..afb4126
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/TMA.txt
@@ -0,0 +1,48 @@
+SHORT Top down cycle allocation
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_ISSUED_ANY
+PMC1 UOPS_RETIRED_RETIRE_SLOTS
+PMC2 IDQ_UOPS_NOT_DELIVERED_CORE
+PMC3 INT_MISC_RECOVERY_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+IPC FIXC0/FIXC1
+Total Slots 4*FIXC1
+Slots Retired PMC1
+Fetch Bubbles PMC2
+Recovery Bubbles 4*PMC3
+Front End [%] PMC2/(4*FIXC1)*100
+Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100
+Retiring [%] PMC1/(4*FIXC1)*100
+Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100
+
+LONG
+Formulas:
+Total Slots = 4*CPU_CLK_UNHALTED_CORE
+Slots Retired = UOPS_RETIRED_RETIRE_SLOTS
+Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE
+Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES
+Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100
+Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100
+Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100
+Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100
+--
+This performance group measures cycles in order to determine the percentage
+of execution time spent in the front end, back end, retiring and speculation.
+These metrics are published and verified by Intel. Further information:
+Webpage describing the Top-Down Method and its usage in Intel VTune:
+https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method
+Paper by Ahmad Yasin:
+https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0
+Slides by Ahmad Yasin:
+http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf
+The performance group was originally published here:
+http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/
diff --git a/collectors/likwid/groups/sandybridgeEP/UOPS.txt b/collectors/likwid/groups/sandybridgeEP/UOPS.txt
new file mode 100644
index 0000000..a4d35d8
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/UOPS.txt
@@ -0,0 +1,32 @@
+SHORT UOPs execution info
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_ISSUED_ANY
+PMC1 UOPS_EXECUTED_THREAD
+PMC2 UOPS_RETIRED_ALL
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Issued UOPs PMC0
+Executed UOPs PMC1
+Retired UOPs PMC2
+
+LONG
+Formulas:
+Issued UOPs = UOPS_ISSUED_ANY
+Executed UOPs = UOPS_EXECUTED_THREAD
+Retired UOPs = UOPS_RETIRED_ALL
+-
+This group returns information about the instruction pipeline. It measures
+the issued, executed and retired uOPs, from which the number of uOPs that
+were issued but not executed, as well as the number that were executed but
+never retired, can be derived. uOPs that execute but do not retire commonly
+come from speculatively executed (mispredicted) branches.
+
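The TMA group above splits the 4 issue slots available per cycle into four buckets that add up to 100%. The C sketch below applies exactly those formulas to invented counter values, so the decomposition can be verified numerically:

    #include <stdio.h>

    int main(void) {
        /* Hypothetical counts (invented for the example) */
        double clk      = 1.0e9; /* CPU_CLK_UNHALTED_CORE */
        double issued   = 2.6e9; /* UOPS_ISSUED_ANY */
        double retired  = 2.4e9; /* UOPS_RETIRED_RETIRE_SLOTS */
        double fetchbub = 6.0e8; /* IDQ_UOPS_NOT_DELIVERED_CORE */
        double recovery = 2.0e7; /* INT_MISC_RECOVERY_CYCLES */
        double slots = 4.0 * clk; /* 4 issue slots per cycle */
        double frontend    = fetchbub / slots * 100.0;
        double speculation = (issued - retired + 4.0 * recovery) / slots * 100.0;
        double retiring    = retired / slots * 100.0;
        double backend     = (1.0 - (fetchbub + issued + 4.0 * recovery) / slots) * 100.0;
        /* The four buckets add up to 100% of the available slots */
        printf("Front End %.1f%%  Speculation %.1f%%  Retiring %.1f%%  Back End %.1f%%\n",
               frontend, speculation, retiring, backend);
        return 0;
    }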
diff --git a/collectors/likwid/groups/sandybridgeEP/UOPS_EXEC.txt b/collectors/likwid/groups/sandybridgeEP/UOPS_EXEC.txt
new file mode 100644
index 0000000..7042df7
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/UOPS_EXEC.txt
@@ -0,0 +1,31 @@
+SHORT UOPs execution
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_EXECUTED_USED_CYCLES
+PMC1 UOPS_EXECUTED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles with
+regard to the execution stage of the pipeline. Used cycles are all cycles
+in which uOPs are executed, while unused cycles refer to pipeline stalls.
+Moreover, the group calculates the average stall duration in cycles.
diff --git a/collectors/likwid/groups/sandybridgeEP/UOPS_ISSUE.txt b/collectors/likwid/groups/sandybridgeEP/UOPS_ISSUE.txt
new file mode 100644
index 0000000..9aac923
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/UOPS_ISSUE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs issuing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_ISSUED_USED_CYCLES
+PMC1 UOPS_ISSUED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles with
+regard to the issue stage of the pipeline. Used cycles are all cycles in
+which uOPs are issued, while unused cycles refer to pipeline stalls.
+Moreover, the group calculates the average stall duration in cycles.
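The EDGEDETECT option in the groups above counts rising edges of the stall condition, i.e. the number of distinct stall phases, so stall cycles divided by edges yields the average length of one stall phase. A small C sketch with invented values:

    #include <stdio.h>

    int main(void) {
        /* Hypothetical counts (invented for the example) */
        double stall_cycles = 3.0e8; /* UOPS_ISSUED_STALL_CYCLES */
        double stall_edges  = 1.5e7; /* UOPS_ISSUED_STALL_CYCLES:EDGEDETECT */
        double total_cycles = 1.0e9; /* CPU_CLOCK_UNHALTED_TOTAL_CYCLES */
        printf("Unused cycles ratio [%%]: %.1f\n",
               100.0 * stall_cycles / total_cycles);
        printf("Avg stall duration [cycles]: %.1f\n",
               stall_cycles / stall_edges);
        return 0;
    }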
diff --git a/collectors/likwid/groups/sandybridgeEP/UOPS_RETIRE.txt b/collectors/likwid/groups/sandybridgeEP/UOPS_RETIRE.txt
new file mode 100644
index 0000000..0f37585
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/UOPS_RETIRE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs retirement
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_RETIRED_USED_CYCLES
+PMC1 UOPS_RETIRED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles with
+regard to the retirement stage of the pipeline (re-order buffer). Used
+cycles are all cycles in which uOPs are retired, while unused cycles refer
+to pipeline stalls. Moreover, the group calculates the average stall
+duration in cycles.
diff --git a/collectors/likwid/groups/silvermont/BRANCH.txt b/collectors/likwid/groups/silvermont/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/collectors/likwid/groups/silvermont/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 BR_INST_RETIRED_ALL_BRANCHES
+PMC1 BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Branch rate PMC0/FIXC0
+Branch misprediction rate PMC1/FIXC0
+Branch misprediction ratio PMC1/PMC0
+Instructions per branch FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often, on average, a branch or a mispredicted branch
+occurred per retired instruction. The branch misprediction ratio directly
+relates the mispredicted branches to all branch instructions.
+Instructions per branch is 1/branch rate.
+
diff --git a/collectors/likwid/groups/silvermont/CLOCK.txt b/collectors/likwid/groups/silvermont/CLOCK.txt
new file mode 100644
index 0000000..b2174c8
--- /dev/null
+++ b/collectors/likwid/groups/silvermont/CLOCK.txt
@@ -0,0 +1,23 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+
+LONG
+Formulas:
+Power = PWR_PKG_ENERGY / time
+-
+Silvermont implements the RAPL interface, which enables monitoring of the
+energy consumed at the package (socket) level.
+
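RAPL exposes energy as a running total in Joules, so average power is simply the energy consumed during the measurement divided by the measurement time. A trivial C sketch with invented readings:

    #include <stdio.h>

    int main(void) {
        /* Hypothetical RAPL reading (invented): PWR_PKG_ENERGY delta */
        double pkg_energy = 90.0; /* Joules consumed during the run */
        double time = 2.0;        /* measurement time in seconds */
        printf("Energy [J]: %.1f\n", pkg_energy);
        printf("Power  [W]: %.1f\n", pkg_energy / time); /* 45 W average */
        return 0;
    }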
diff --git a/collectors/likwid/groups/silvermont/DATA.txt b/collectors/likwid/groups/silvermont/DATA.txt
new file mode 100644
index 0000000..61a915b
--- /dev/null
+++ b/collectors/likwid/groups/silvermont/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_UOPS_RETIRED_ALL_LOADS
+PMC1 MEM_UOPS_RETIRED_ALL_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_UOPS_RETIRED_ALL_LOADS/MEM_UOPS_RETIRED_ALL_STORES
+-
+This is a metric to determine your load-to-store ratio.
+
diff --git a/collectors/likwid/groups/silvermont/DIVIDE.txt b/collectors/likwid/groups/silvermont/DIVIDE.txt
new file mode 100644
index 0000000..f82fc59
--- /dev/null
+++ b/collectors/likwid/groups/silvermont/DIVIDE.txt
@@ -0,0 +1,24 @@
+SHORT Divide unit information
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 CYCLES_DIV_BUSY_ANY
+PMC1:EDGEDETECT CYCLES_DIV_BUSY_ANY
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Number of divide ops PMC1:EDGEDETECT
+Avg. divide unit usage duration PMC0/PMC1:EDGEDETECT
+
+LONG
+Formulas:
+Number of divide ops = CYCLES_DIV_BUSY_ANY:EDGEDETECT
+Avg. divide unit usage duration = CYCLES_DIV_BUSY_ANY/CYCLES_DIV_BUSY_ANY:EDGEDETECT
+-
+This performance group measures the average latency of divide operations.
diff --git a/collectors/likwid/groups/silvermont/ENERGY.txt b/collectors/likwid/groups/silvermont/ENERGY.txt
new file mode 100644
index 0000000..73939a3
--- /dev/null
+++ b/collectors/likwid/groups/silvermont/ENERGY.txt
@@ -0,0 +1,29 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+TMP0 TEMP_CORE
+PWR0 PWR_PKG_ENERGY
+PWR1 PWR_PP0_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Temperature [C] TMP0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy PP0 [J] PWR1
+Power PP0 [W] PWR1/time
+
+LONG
+Formulas:
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+-
+Silvermont implements the RAPL interface, which enables monitoring of the
+energy consumed at the package (socket) level.
+
diff --git a/collectors/likwid/groups/silvermont/ICACHE.txt b/collectors/likwid/groups/silvermont/ICACHE.txt
new file mode 100644
index 0000000..5f11ad6
--- /dev/null
+++ b/collectors/likwid/groups/silvermont/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ICACHE_ACCESSES
+PMC1 ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+-
+This group measures some L1 instruction cache metrics.
diff --git a/collectors/likwid/groups/silvermont/L2CACHE.txt b/collectors/likwid/groups/silvermont/L2CACHE.txt
new file mode 100644
index 0000000..32a1545
--- /dev/null
+++ b/collectors/likwid/groups/silvermont/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 LONGEST_LAT_CACHE_REFERENCE
+PMC1 LONGEST_LAT_CACHE_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = LONGEST_LAT_CACHE_REFERENCE/INSTR_RETIRED_ANY
+L2 miss rate = LONGEST_LAT_CACHE_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = LONGEST_LAT_CACHE_MISS/LONGEST_LAT_CACHE_REFERENCE
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. The L2 request rate tells you how data-intensive your code is,
+i.e. how many data accesses it performs on average per instruction.
+The L2 miss rate indicates how often a cache line had to be fetched from
+memory, per retired instruction. Finally, the L2 miss ratio tells you what
+fraction of all L2 requests required a cache line to be loaded from a
+higher level. While the miss rate may be dictated by your algorithm, you
+should try to keep the miss ratio as low as possible by increasing your
+cache reuse.
+
diff --git a/collectors/likwid/groups/silvermont/MEM.txt b/collectors/likwid/groups/silvermont/MEM.txt
new file mode 100644
index 0000000..de78337
--- /dev/null
+++ b/collectors/likwid/groups/silvermont/MEM.txt
@@ -0,0 +1,37 @@
+SHORT Memory load bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 LONGEST_LAT_CACHE_MISS
+PMC1 OFFCORE_RESPONSE_1_WB_ANY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(PMC0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(PMC0)*64.0
+Memory writeback bandwidth [MBytes/s] 1.0E-06*(PMC1)*64.0/time
+Memory writeback data volume [GBytes] 1.0E-09*(PMC1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(LONGEST_LAT_CACHE_MISS)*64/time
+Memory read data volume [GBytes] = 1.0E-09*(LONGEST_LAT_CACHE_MISS)*64
+Memory writeback bandwidth [MBytes/s] = 1.0E-06*(OFFCORE_RESPONSE_1_WB_ANY)*64/time
+Memory writeback data volume [GBytes] = 1.0E-09*(OFFCORE_RESPONSE_1_WB_ANY)*64
+Memory bandwidth [MBytes/s] = 1.0E-06*(LONGEST_LAT_CACHE_MISS+OFFCORE_RESPONSE_1_WB_ANY)*64/time
+Memory data volume [GBytes] = 1.0E-09*(LONGEST_LAT_CACHE_MISS+OFFCORE_RESPONSE_1_WB_ANY)*64
+-
+Profiling group to measure the L2 to memory load bandwidth. The bandwidth
+is computed from the number of cache lines allocated in the L2 cache. Since
+there is no way to count the evicted cache lines, this group measures only
+the load bandwidth. The writeback metrics count only modified cache lines
+that are written back in order to go to exclusive state.
+The group also outputs the total load and writeback data volume transferred
+between memory and L2.
+
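The L2CACHE group above illustrates the difference between a miss rate and a miss ratio: the rate normalizes misses per retired instruction, the ratio normalizes them per cache request. A minimal C sketch with invented counter values:

    #include <stdio.h>

    int main(void) {
        /* Hypothetical counts (invented for the example) */
        double refs   = 2.0e8; /* LONGEST_LAT_CACHE_REFERENCE */
        double misses = 4.0e7; /* LONGEST_LAT_CACHE_MISS */
        double instr  = 8.0e9; /* INSTR_RETIRED_ANY */
        printf("L2 request rate: %.4f\n", refs / instr);   /* per instruction */
        printf("L2 miss rate   : %.4f\n", misses / instr); /* per instruction */
        printf("L2 miss ratio  : %.2f\n", misses / refs);  /* per L2 request */
        return 0;
    }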
diff --git a/collectors/likwid/groups/silvermont/TLB_DATA.txt b/collectors/likwid/groups/silvermont/TLB_DATA.txt
new file mode 100644
index 0000000..5f2617f
--- /dev/null
+++ b/collectors/likwid/groups/silvermont/TLB_DATA.txt
@@ -0,0 +1,27 @@
+SHORT L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 PAGE_WALKS_DTLB_COUNT
+PMC1 PAGE_WALKS_DTLB_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 DTLB misses PMC0
+L1 DTLB miss rate PMC0/FIXC0
+L1 DTLB miss duration [Cyc] PMC1/PMC0
+
+LONG
+Formulas:
+L1 DTLB misses = PAGE_WALKS_DTLB_COUNT
+L1 DTLB miss rate = PAGE_WALKS_DTLB_COUNT / INSTR_RETIRED_ANY
+L1 DTLB miss duration [Cyc] = PAGE_WALKS_DTLB_CYCLES / PAGE_WALKS_DTLB_COUNT
+-
+The DTLB miss rate indicates how often a TLB miss occurred per instruction.
+The duration is the average number of cycles a page table walk took.
+
diff --git a/collectors/likwid/groups/silvermont/TLB_INSTR.txt b/collectors/likwid/groups/silvermont/TLB_INSTR.txt
new file mode 100644
index 0000000..f3dd3ec
--- /dev/null
+++ b/collectors/likwid/groups/silvermont/TLB_INSTR.txt
@@ -0,0 +1,27 @@
+SHORT L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 PAGE_WALKS_ITLB_COUNT
+PMC1 PAGE_WALKS_ITLB_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 ITLB misses PMC0
+L1 ITLB miss rate PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = PAGE_WALKS_ITLB_COUNT
+L1 ITLB miss rate = PAGE_WALKS_ITLB_COUNT / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = PAGE_WALKS_ITLB_CYCLES / PAGE_WALKS_ITLB_COUNT
+-
+The ITLB miss rate indicates how often a TLB miss occurred per instruction.
+The duration is the average number of cycles a page table walk took.
diff --git a/collectors/likwid/groups/skylake/BRANCH.txt b/collectors/likwid/groups/skylake/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/collectors/likwid/groups/skylake/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 BR_INST_RETIRED_ALL_BRANCHES
+PMC1 BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Branch rate PMC0/FIXC0
+Branch misprediction rate PMC1/FIXC0
+Branch misprediction ratio PMC1/PMC0
+Instructions per branch FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often, on average, a branch or a mispredicted branch
+occurred per retired instruction. The branch misprediction ratio directly
+relates the mispredicted branches to all branch instructions.
+Instructions per branch is 1/branch rate.
+
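Applying the BRANCH formulas to concrete numbers makes the rate/ratio distinction obvious. A minimal C sketch; all counter values are invented:

    #include <stdio.h>

    int main(void) {
        /* Hypothetical counts (invented for the example) */
        double branches = 1.0e9; /* BR_INST_RETIRED_ALL_BRANCHES */
        double mispred  = 2.0e7; /* BR_MISP_RETIRED_ALL_BRANCHES */
        double instr    = 6.0e9; /* INSTR_RETIRED_ANY */
        printf("Branch rate: %.3f\n", branches / instr);
        printf("Branch misprediction rate: %.4f\n", mispred / instr);
        printf("Branch misprediction ratio: %.3f\n", mispred / branches);
        printf("Instructions per branch: %.1f\n", instr / branches);
        return 0;
    }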
diff --git a/collectors/likwid/groups/skylake/CLOCK.txt b/collectors/likwid/groups/skylake/CLOCK.txt
new file mode 100644
index 0000000..d682e3a
--- /dev/null
+++ b/collectors/likwid/groups/skylake/CLOCK.txt
@@ -0,0 +1,30 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+UBOXFIX UNCORE_CLOCK
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+Uncore Clock [MHz] 1.E-06*UBOXFIX/time
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formulas:
+Power = PWR_PKG_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time
+-
+Skylake implements the RAPL interface, which enables monitoring of the
+energy consumed at the package (socket) and DRAM levels.
+
diff --git a/collectors/likwid/groups/skylake/CYCLE_ACTIVITY.txt b/collectors/likwid/groups/skylake/CYCLE_ACTIVITY.txt
new file mode 100644
index 0000000..c432a44
--- /dev/null
+++ b/collectors/likwid/groups/skylake/CYCLE_ACTIVITY.txt
@@ -0,0 +1,38 @@
+SHORT Cycle Activities
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING
+PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING
+PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING
+PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Cycles without execution [%] (PMC3/FIXC1)*100
+Cycles without execution due to L1D [%] (PMC2/FIXC1)*100
+Cycles without execution due to L2 [%] (PMC0/FIXC1)*100
+Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100
+
+LONG
+Formulas:
+Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100
+Cycles without execution due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100
+Cycles without execution due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100
+Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100
+--
+This performance group measures the cycles spent waiting for data from the
+cache and memory hierarchy.
+CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts the number of cycles in which
+nothing is executed on any execution port.
+CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while an L1 cache miss demand
+load is outstanding.
+CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while an L2 cache miss demand
+load is outstanding.
+CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while the memory subsystem has
+an outstanding load.
diff --git a/collectors/likwid/groups/skylake/CYCLE_STALLS.txt b/collectors/likwid/groups/skylake/CYCLE_STALLS.txt
new file mode 100644
index 0000000..795aeb9
--- /dev/null
+++ b/collectors/likwid/groups/skylake/CYCLE_STALLS.txt
@@ -0,0 +1,45 @@
+SHORT Cycle Activities (Stalls)
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING
+PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING
+PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING
+PMC3 CYCLE_ACTIVITY_STALLS_TOTAL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Total execution stalls PMC3
+Stalls caused by L1D misses [%] (PMC2/PMC3)*100
+Stalls caused by L2 misses [%] (PMC0/PMC3)*100
+Stalls caused by memory loads [%] (PMC1/PMC3)*100
+Execution stall rate [%] (PMC3/FIXC1)*100
+Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100
+Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100
+Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100
+
+LONG
+Formulas:
+Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL
+Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
+Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
+Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
+Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100
+Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100
+Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100
+Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100
+--
+This performance group measures the stalls caused by data traffic in the
+cache hierarchy.
+CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls.
+CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while an L1 cache miss
+demand load is outstanding.
+CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while an L2 cache miss
+demand load is outstanding.
+CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while the memory
+subsystem has an outstanding load.
diff --git a/collectors/likwid/groups/skylake/DATA.txt b/collectors/likwid/groups/skylake/DATA.txt
new file mode 100644
index 0000000..4e6e938
--- /dev/null
+++ b/collectors/likwid/groups/skylake/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_INST_RETIRED_ALL_LOADS
+PMC1 MEM_INST_RETIRED_ALL_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES
+-
+This is a metric to determine your load-to-store ratio.
+
diff --git a/collectors/likwid/groups/skylake/DIVIDE.txt b/collectors/likwid/groups/skylake/DIVIDE.txt
new file mode 100644
index 0000000..40b4ab6
--- /dev/null
+++ b/collectors/likwid/groups/skylake/DIVIDE.txt
@@ -0,0 +1,24 @@
+SHORT Divide unit information
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ARITH_DIVIDER_COUNT
+PMC1 ARITH_DIVIDER_ACTIVE
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Number of divide ops PMC0
+Avg. divide unit usage duration PMC1/PMC0
+
+LONG
+Formulas:
+Number of divide ops = ARITH_DIVIDER_COUNT
+Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT
+-
+This performance group measures the average latency of divide operations.
diff --git a/collectors/likwid/groups/skylake/ENERGY.txt b/collectors/likwid/groups/skylake/ENERGY.txt
new file mode 100644
index 0000000..07dbda5
--- /dev/null
+++ b/collectors/likwid/groups/skylake/ENERGY.txt
@@ -0,0 +1,39 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+TMP0 TEMP_CORE
+PWR0 PWR_PKG_ENERGY
+PWR1 PWR_PP0_ENERGY
+PWR2 PWR_PP1_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Temperature [C] TMP0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy PP0 [J] PWR1
+Power PP0 [W] PWR1/time
+Energy PP1 [J] PWR2
+Power PP1 [W] PWR2/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formulas:
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power PP1 = PWR_PP1_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+Skylake implements the RAPL interface, which enables monitoring of the
+energy consumed at the package (socket) and DRAM levels.
+
diff --git a/collectors/likwid/groups/skylake/FALSE_SHARE.txt b/collectors/likwid/groups/skylake/FALSE_SHARE.txt
new file mode 100644
index 0000000..65ff4d4
--- /dev/null
+++ b/collectors/likwid/groups/skylake/FALSE_SHARE.txt
@@ -0,0 +1,25 @@
+SHORT False sharing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_L3_HIT_RETIRED_XSNP_HITM
+PMC2 MEM_INST_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Local LLC false sharing [MByte] 1.E-06*PMC0*64
+Local LLC false sharing rate PMC0/PMC2
+
+LONG
+Formulas:
+Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_L3_HIT_RETIRED_XSNP_HITM*64
+Local LLC false sharing rate = MEM_LOAD_L3_HIT_RETIRED_XSNP_HITM/MEM_INST_RETIRED_ALL
+-
+False sharing of cache lines can dramatically reduce the performance of an
+application. This performance group measures the L3 traffic induced by false
+sharing. The false-sharing rate uses all retired memory instructions
+(MEM_INST_RETIRED_ALL) as reference.
diff --git a/collectors/likwid/groups/skylake/FLOPS_AVX.txt b/collectors/likwid/groups/skylake/FLOPS_AVX.txt
new file mode 100644
index 0000000..ebde747
--- /dev/null
+++ b/collectors/likwid/groups/skylake/FLOPS_AVX.txt
@@ -0,0 +1,24 @@
+SHORT Packed AVX MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0)/time
+Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0)/time
+
+LONG
+Formulas:
+Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+-
+Packed AVX (256-bit) FLOP rates.
+
diff --git a/collectors/likwid/groups/skylake/FLOPS_DP.txt b/collectors/likwid/groups/skylake/FLOPS_DP.txt
new file mode 100644
index 0000000..ff7a833
--- /dev/null
+++ b/collectors/likwid/groups/skylake/FLOPS_DP.txt
@@ -0,0 +1,31 @@
+SHORT Double Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
+PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
+PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time
+Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
+Vectorization ratio [%] 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2)
+
+LONG
+Formulas:
+DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)
+-
+Double precision FLOP rates, covering scalar, SSE-packed and AVX-packed
+operations.
+
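The FLOPS_DP weighting (128-bit packed = 2 DP FLOPs per uOP, scalar = 1, 256-bit packed = 4) and the vectorization ratio can be checked with a few lines of C; the counter values are invented for the example:

    #include <stdio.h>

    int main(void) {
        /* Hypothetical counts (invented for the example) */
        double p128   = 1.0e9; /* FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE */
        double scalar = 5.0e8; /* FP_ARITH_INST_RETIRED_SCALAR_DOUBLE */
        double p256   = 3.0e9; /* FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE */
        double time   = 1.0;   /* measurement time in seconds */
        /* 128-bit packed = 2 DP FLOPs, scalar = 1, 256-bit packed = 4 */
        printf("DP [MFLOP/s]: %.1f\n",
               1.0e-6 * (p128 * 2.0 + scalar + p256 * 4.0) / time);
        printf("Vectorization ratio [%%]: %.1f\n",
               100.0 * (p128 + p256) / (p128 + scalar + p256));
        return 0;
    }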
diff --git a/collectors/likwid/groups/skylake/FLOPS_SP.txt b/collectors/likwid/groups/skylake/FLOPS_SP.txt
new file mode 100644
index 0000000..3a7d56b
--- /dev/null
+++ b/collectors/likwid/groups/skylake/FLOPS_SP.txt
@@ -0,0 +1,31 @@
+SHORT Single Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
+PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE
+PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time
+Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
+Vectorization ratio [%] 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2)
+
+LONG
+Formulas:
+SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)
+-
+Single precision FLOP rates, covering scalar, SSE-packed and AVX-packed
+operations.
+
diff --git a/collectors/likwid/groups/skylake/ICACHE.txt b/collectors/likwid/groups/skylake/ICACHE.txt
new file mode 100644
index 0000000..aab7dac
--- /dev/null
+++ b/collectors/likwid/groups/skylake/ICACHE.txt
@@ -0,0 +1,30 @@
+SHORT Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ICACHE_64B_IFTAG_ALL
+PMC1 ICACHE_64B_IFTAG_MISS
+PMC2 ICACHE_64B_IFTAG_STALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+L1I stalls PMC2
+L1I stall rate PMC2/FIXC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_64B_IFTAG_ALL / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_64B_IFTAG_MISS / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_64B_IFTAG_MISS / ICACHE_64B_IFTAG_ALL
+L1I stalls = ICACHE_64B_IFTAG_STALL
+L1I stall rate = ICACHE_64B_IFTAG_STALL / INSTR_RETIRED_ANY
+-
+This group measures some L1 instruction cache metrics.
diff --git a/collectors/likwid/groups/skylake/L2.txt b/collectors/likwid/groups/skylake/L2.txt
new file mode 100644
index 0000000..1a92a95
--- /dev/null
+++ b/collectors/likwid/groups/skylake/L2.txt
@@ -0,0 +1,38 @@
+SHORT L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L1D_REPLACEMENT
+PMC1 L1D_M_EVICT
+PMC2 ICACHE_64B_IFTAG_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_64B_IFTAG_MISS)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_64B_IFTAG_MISS)*64
+-
+Profiling group to measure the L2 cache bandwidth. The bandwidth is computed
+from the number of cache lines allocated in the L1 and the number of
+modified cache lines evicted from the L1. The group also outputs the total
+data volume transferred between L2 and L1. Note that this bandwidth also
+includes data transfers caused by write-allocate loads on store misses in
+L1 and by misses in the L1 instruction cache.
+
diff --git a/collectors/likwid/groups/skylake/L2CACHE.txt b/collectors/likwid/groups/skylake/L2CACHE.txt
new file mode 100644
index 0000000..fbc3745
--- /dev/null
+++ b/collectors/likwid/groups/skylake/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_TRANS_ALL_REQUESTS
+PMC1 L2_RQSTS_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. The L2 request rate tells you how data-intensive your code is,
+i.e. how many data accesses it performs on average per instruction.
+The L2 miss rate indicates how often a cache line had to be fetched from
+memory, per retired instruction. Finally, the L2 miss ratio tells you what
+fraction of all L2 requests required a cache line to be loaded from a
+higher level. While the miss rate may be dictated by your algorithm, you
+should try to keep the miss ratio as low as possible by increasing your
+cache reuse.
+
+
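The L2 bandwidth computation above is again just "cache lines times 64 bytes over time"; the contributing line counts come from three different events. A minimal C sketch with invented counter values:

    #include <stdio.h>

    int main(void) {
        /* Hypothetical counts (invented for the example) */
        double l1d_repl   = 5.0e8; /* L1D_REPLACEMENT: lines loaded into L1 */
        double l1d_evict  = 2.0e8; /* L1D_M_EVICT: modified lines evicted */
        double iftag_miss = 1.0e7; /* ICACHE_64B_IFTAG_MISS: L1I misses */
        double time = 2.0;         /* measurement time in seconds */
        double lines = l1d_repl + l1d_evict + iftag_miss;
        printf("L2 bandwidth [MBytes/s]: %.1f\n", 1.0e-6 * lines * 64.0 / time);
        printf("L2 data volume [GBytes]: %.2f\n", 1.0e-9 * lines * 64.0);
        return 0;
    }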
diff --git a/collectors/likwid/groups/skylake/L3.txt b/collectors/likwid/groups/skylake/L3.txt
new file mode 100644
index 0000000..f63a918
--- /dev/null
+++ b/collectors/likwid/groups/skylake/L3.txt
@@ -0,0 +1,36 @@
+SHORT L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_LINES_IN_ALL
+PMC1 L2_TRANS_L2_WB
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
+-
+Profiling group to measure the L3 cache bandwidth. The bandwidth is computed
+from the number of cache lines allocated in the L2 and the number of
+modified cache lines evicted from the L2. The group also outputs the data
+volume transferred between the L3 and the measured cores' L2 caches. Note
+that this bandwidth also includes data transfers caused by write-allocate
+loads on store misses in L2.
+
diff --git a/collectors/likwid/groups/skylake/L3CACHE.txt b/collectors/likwid/groups/skylake/L3CACHE.txt
new file mode 100644
index 0000000..94953ef
--- /dev/null
+++ b/collectors/likwid/groups/skylake/L3CACHE.txt
@@ -0,0 +1,35 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_LOAD_RETIRED_L3_HIT
+PMC1 MEM_LOAD_RETIRED_L3_MISS
+PMC2 UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 request rate (PMC0+PMC1)/PMC2
+L3 miss rate PMC1/PMC2
+L3 miss ratio PMC1/(PMC0+PMC1)
+
+LONG
+Formulas:
+L3 request rate = (MEM_LOAD_RETIRED_L3_HIT+MEM_LOAD_RETIRED_L3_MISS)/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_RETIRED_L3_MISS/(MEM_LOAD_RETIRED_L3_HIT+MEM_LOAD_RETIRED_L3_MISS)
+-
+This group measures the locality of your data accesses with regard to the
+L3 cache. The L3 request rate tells you how data-intensive your code is,
+i.e. how many L3 accesses occur on average per retired uOP.
+The L3 miss rate indicates how often a cache line had to be fetched from
+memory, per retired uOP. Finally, the L3 miss ratio tells you what fraction
+of all L3 requests required a cache line to be loaded from a higher level.
+While the miss rate may be dictated by your algorithm, you should try to
+keep the miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/collectors/likwid/groups/skylake/MEM.txt b/collectors/likwid/groups/skylake/MEM.txt
new file mode 100644
index 0000000..3a12df7
--- /dev/null
+++ b/collectors/likwid/groups/skylake/MEM.txt
@@ -0,0 +1,36 @@
+SHORT Main memory bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+MBOX0C1 DRAM_READS
+MBOX0C2 DRAM_WRITES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Memory load bandwidth [MBytes/s] 1.0E-06*MBOX0C1*64.0/time
+Memory load data volume [GBytes] 1.0E-09*MBOX0C1*64.0
+Memory evict bandwidth [MBytes/s] 1.0E-06*MBOX0C2*64.0/time
+Memory evict data volume [GBytes] 1.0E-09*MBOX0C2*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX0C2)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX0C2)*64.0
+
+LONG
+Formulas:
+Memory load bandwidth [MBytes/s] = 1.0E-06*DRAM_READS*64.0/time
+Memory load data volume [GBytes] = 1.0E-09*DRAM_READS*64.0
+Memory evict bandwidth [MBytes/s] = 1.0E-06*DRAM_WRITES*64.0/time
+Memory evict data volume [GBytes] = 1.0E-09*DRAM_WRITES*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(DRAM_READS+DRAM_WRITES)*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(DRAM_READS+DRAM_WRITES)*64.0
+-
+Profiling group to measure the memory bandwidth drawn by all cores of a
+socket. Since this group is based on Uncore events, measurements are only
+possible on a per-socket basis. It also outputs the total data volume
+transferred from main memory.
+
diff --git a/collectors/likwid/groups/skylake/MEM_DP.txt b/collectors/likwid/groups/skylake/MEM_DP.txt
new file mode 100644
index 0000000..14a359a
--- /dev/null
+++ b/collectors/likwid/groups/skylake/MEM_DP.txt
@@ -0,0 +1,59 @@
+SHORT Overview of arithmetic and main memory performance
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
+PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
+PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+MBOX0C1 DRAM_READS
+MBOX0C2 DRAM_WRITES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time
+Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
+Memory load bandwidth [MBytes/s] 1.0E-06*MBOX0C1*64.0/time
+Memory load data volume [GBytes] 1.0E-09*MBOX0C1*64.0
+Memory evict bandwidth [MBytes/s] 1.0E-06*MBOX0C2*64.0/time
+Memory evict data volume [GBytes] 1.0E-09*MBOX0C2*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX0C2)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX0C2)*64.0
+Operational intensity (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C1+MBOX0C2)*64.0)
+
+LONG
+Formulas:
+Power [W] = PWR_PKG_ENERGY/runtime
+Power DRAM [W] = PWR_DRAM_ENERGY/runtime
+DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+Memory load bandwidth [MBytes/s] = 1.0E-06*DRAM_READS*64.0/runtime
+Memory load data volume [GBytes] = 1.0E-09*DRAM_READS*64.0
+Memory evict bandwidth [MBytes/s] = 1.0E-06*DRAM_WRITES*64.0/runtime
+Memory evict data volume [GBytes] = 1.0E-09*DRAM_WRITES*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(DRAM_READS+DRAM_WRITES)*64.0/runtime
+Memory data volume [GBytes] = 1.0E-09*(DRAM_READS+DRAM_WRITES)*64.0
+Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/((DRAM_READS+DRAM_WRITES)*64.0)
+--
+Profiling group to measure the memory bandwidth drawn by all cores of a
+socket. Since this group is based on Uncore events, measurements are only
+possible on a per-socket basis. It also outputs the total data volume
+transferred from main memory, together with scalar and packed double
+precision FLOP rates, including the packed AVX (256-bit) rate.
+The operational intensity is calculated using the FP counts of the measured
+cores and the memory data volume of the whole socket. The actual operational
+intensity for multiple CPUs can be found in the statistics table in the
+Sum column.
diff --git a/collectors/likwid/groups/skylake/MEM_SP.txt b/collectors/likwid/groups/skylake/MEM_SP.txt
new file mode 100644
index 0000000..0b47052
--- /dev/null
+++ b/collectors/likwid/groups/skylake/MEM_SP.txt
@@ -0,0 +1,59 @@
+SHORT Overview of arithmetic and main memory performance
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
+PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE
+PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+MBOX0C1 DRAM_READS
+MBOX0C2 DRAM_WRITES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time
+Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
+Memory load bandwidth [MBytes/s] 1.0E-06*MBOX0C1*64.0/time
+Memory load data volume [GBytes] 1.0E-09*MBOX0C1*64.0
+Memory evict bandwidth [MBytes/s] 1.0E-06*MBOX0C2*64.0/time
+Memory evict data volume [GBytes] 1.0E-09*MBOX0C2*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX0C2)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX0C2)*64.0
+Operational intensity (PMC0*4.0+PMC1+PMC2*8.0)/((MBOX0C1+MBOX0C2)*64.0)
+
+LONG
+Formulas:
+Power [W] = PWR_PKG_ENERGY/runtime
+Power DRAM [W] = PWR_DRAM_ENERGY/runtime
+SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+Memory load bandwidth [MBytes/s] = 1.0E-06*DRAM_READS*64.0/runtime
+Memory load data volume [GBytes] = 1.0E-09*DRAM_READS*64.0
+Memory evict bandwidth [MBytes/s] = 1.0E-06*DRAM_WRITES*64.0/runtime
+Memory evict data volume [GBytes] = 1.0E-09*DRAM_WRITES*64.0
1.0E-06*(DRAM_READS+DRAM_WRITES)*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(DRAM_READS+DRAM_WRITES)*64.0 +Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/((DRAM_READS+DRAM_WRITES)*64.0) +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per-socket basis. Also outputs total data volume transferred from main memory. +SSE scalar and packed single precision FLOP rates. Also reports on packed AVX +(256-bit) instructions. +The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. diff --git a/collectors/likwid/groups/skylake/PORT_USAGE.txt b/collectors/likwid/groups/skylake/PORT_USAGE.txt new file mode 100644 index 0000000..eca8f2a --- /dev/null +++ b/collectors/likwid/groups/skylake/PORT_USAGE.txt @@ -0,0 +1,46 @@ +SHORT Execution port utilization + +REQUIRE_NOHT + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_DISPATCHED_PORT_PORT_0 +PMC1 UOPS_DISPATCHED_PORT_PORT_1 +PMC2 UOPS_DISPATCHED_PORT_PORT_2 +PMC3 UOPS_DISPATCHED_PORT_PORT_3 +PMC4 UOPS_DISPATCHED_PORT_PORT_4 +PMC5 UOPS_DISPATCHED_PORT_PORT_5 +PMC6 UOPS_DISPATCHED_PORT_PORT_6 +PMC7 UOPS_DISPATCHED_PORT_PORT_7 + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Port0 usage ratio PMC0/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port1 usage ratio PMC1/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port2 usage ratio PMC2/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port3 usage ratio PMC3/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port4 usage ratio PMC4/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port5 usage ratio PMC5/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port6 usage ratio PMC6/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port7 usage ratio PMC7/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) + +LONG +Formulas: +Port0 usage ratio = UOPS_DISPATCHED_PORT_PORT_0/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port1 usage ratio = UOPS_DISPATCHED_PORT_PORT_1/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port2 usage ratio = UOPS_DISPATCHED_PORT_PORT_2/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port3 usage ratio = UOPS_DISPATCHED_PORT_PORT_3/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port4 usage ratio = UOPS_DISPATCHED_PORT_PORT_4/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port5 usage ratio = UOPS_DISPATCHED_PORT_PORT_5/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port6 usage ratio = UOPS_DISPATCHED_PORT_PORT_6/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port7 usage ratio = UOPS_DISPATCHED_PORT_PORT_7/SUM(UOPS_DISPATCHED_PORT_PORT_*) +- +This group measures the execution port utilization in a CPU core. The group can +only be measured when HyperThreading is disabled because only then can each CPU +core program all eight counters.
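As a quick illustration of how the PORT_USAGE metrics above are formed, here is a minimal Python sketch (not part of LIKWID; the counter values are invented) that normalizes each port's uop count by the total across all eight ports:

# Hypothetical raw UOPS_DISPATCHED_PORT_PORT_0..7 counts (PMC0..PMC7).
port_uops = [4.2e9, 3.9e9, 2.1e9, 2.0e9, 1.5e9, 3.0e9, 2.8e9, 0.4e9]

total = sum(port_uops)
for port, uops in enumerate(port_uops):
    # PortN usage ratio = PMCN/(PMC0+PMC1+...+PMC7)
    print(f"Port{port} usage ratio: {uops / total:.3f}")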
diff --git a/collectors/likwid/groups/skylake/RECOVERY.txt b/collectors/likwid/groups/skylake/RECOVERY.txt new file mode 100644 index 0000000..7928ee4 --- /dev/null +++ b/collectors/likwid/groups/skylake/RECOVERY.txt @@ -0,0 +1,22 @@ +SHORT Recovery duration + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INT_MISC_RECOVERY_CYCLES +PMC1 INT_MISC_RECOVERY_COUNT + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Avg. recovery duration PMC0/PMC1 + +LONG +Formulas: +Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT +- +This group measures the duration of recoveries after SSE exceptions, memory +disambiguation, etc. diff --git a/collectors/likwid/groups/skylake/TLB_DATA.txt b/collectors/likwid/groups/skylake/TLB_DATA.txt new file mode 100644 index 0000000..10ee5e1 --- /dev/null +++ b/collectors/likwid/groups/skylake/TLB_DATA.txt @@ -0,0 +1,35 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK +PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK +PMC2 DTLB_LOAD_MISSES_WALK_ACTIVE +PMC3 DTLB_STORE_MISSES_WALK_ACTIVE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB load misses PMC0 +L1 DTLB load miss rate PMC0/FIXC0 +L1 DTLB load miss duration [Cyc] PMC2/PMC0 +L1 DTLB store misses PMC1 +L1 DTLB store miss rate PMC1/FIXC0 +L1 DTLB store miss duration [Cyc] PMC3/PMC1 + +LONG +Formulas: +L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_ACTIVE / DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK +L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_ACTIVE / DTLB_STORE_MISSES_CAUSES_A_WALK +- +The DTLB load and store miss rates give a measure of how often a TLB miss occurred +per instruction. The miss duration measures how many cycles a page table walk took on average. + diff --git a/collectors/likwid/groups/skylake/TLB_INSTR.txt b/collectors/likwid/groups/skylake/TLB_INSTR.txt new file mode 100644 index 0000000..9bc65a7 --- /dev/null +++ b/collectors/likwid/groups/skylake/TLB_INSTR.txt @@ -0,0 +1,28 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ITLB_MISSES_CAUSES_A_WALK +PMC1 ITLB_MISSES_WALK_ACTIVE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + + +LONG +Formulas: +L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK +L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_ACTIVE / ITLB_MISSES_CAUSES_A_WALK +- +The ITLB miss rate gives a measure of how often a TLB miss occurred +per instruction. The miss duration measures how many cycles a page table walk took on average.
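The TLB metric arithmetic in the two groups above reduces to two ratios. A minimal Python sketch, with hypothetical counter readings named after the events in the group files:

# Hypothetical counter readings over one measurement interval.
instr_retired = 5.0e9   # INSTR_RETIRED_ANY (FIXC0)
walks_caused  = 2.0e6   # ITLB_MISSES_CAUSES_A_WALK (or DTLB_*_CAUSES_A_WALK)
walk_cycles   = 5.0e7   # ITLB_MISSES_WALK_ACTIVE (or DTLB_*_WALK_ACTIVE)

miss_rate = walks_caused / instr_retired   # misses per retired instruction
avg_walk  = walk_cycles / walks_caused     # average cycles per page table walk
print(f"miss rate: {miss_rate:.2e}/instr, avg walk duration: {avg_walk:.1f} cycles")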
+ diff --git a/collectors/likwid/groups/skylake/TMA.txt b/collectors/likwid/groups/skylake/TMA.txt new file mode 100644 index 0000000..afb4126 --- /dev/null +++ b/collectors/likwid/groups/skylake/TMA.txt @@ -0,0 +1,48 @@ +SHORT Top down cycle allocation + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_RETIRED_RETIRE_SLOTS +PMC2 IDQ_UOPS_NOT_DELIVERED_CORE +PMC3 INT_MISC_RECOVERY_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +IPC FIXC0/FIXC1 +Total Slots 4*FIXC1 +Slots Retired PMC1 +Fetch Bubbles PMC2 +Recovery Bubbles 4*PMC3 +Front End [%] PMC2/(4*FIXC1)*100 +Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100 +Retiring [%] PMC1/(4*FIXC1)*100 +Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100 + +LONG +Formulas: +Total Slots = 4*CPU_CLK_UNHALTED_CORE +Slots Retired = UOPS_RETIRED_RETIRE_SLOTS +Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE +Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES +Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100 +Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100 +Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100 +Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100 +-- +This performance group measures cycles to determine the percentage of time spent in +front end, back end, retiring and speculation. These metrics are published and +verified by Intel. Further information: +Webpage describing the Top-Down Method and its usage in Intel VTune: +https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method +Paper by Ahmad Yasin: +https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0 +Slides by Ahmad Yasin: +http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf +The performance group was originally published here: +http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/ diff --git a/collectors/likwid/groups/skylake/UOPS.txt b/collectors/likwid/groups/skylake/UOPS.txt new file mode 100644 index 0000000..c0a86f2 --- /dev/null +++ b/collectors/likwid/groups/skylake/UOPS.txt @@ -0,0 +1,29 @@ +SHORT UOPs execution info + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_EXECUTED_THREAD +PMC2 UOPS_RETIRED_ALL + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Issued UOPs PMC0 +Executed UOPs PMC1 +Retired UOPs PMC2 + +LONG +Formulas: +Issued UOPs = UOPS_ISSUED_ANY +Executed UOPs = UOPS_EXECUTED_THREAD +Retired UOPs = UOPS_RETIRED_ALL +- +This group returns information about the instruction pipeline. It measures the +issued, executed and retired uOPs.
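To make the slot accounting of the TMA group above concrete, here is a small Python sketch (illustrative only; the counter values are invented) that reproduces the four top-level percentages. Each core cycle provides four issue slots, and the four categories sum to 100 by construction:

def tma_breakdown(uops_issued, uops_retired_slots, fetch_bubbles, recovery_cycles, core_cycles):
    """Top-level Top-Down breakdown; all inputs are raw event counts."""
    total_slots = 4.0 * core_cycles  # 4 issue slots per core cycle
    front_end   = fetch_bubbles / total_slots * 100
    speculation = (uops_issued - uops_retired_slots + 4 * recovery_cycles) / total_slots * 100
    retiring    = uops_retired_slots / total_slots * 100
    back_end    = (1 - (fetch_bubbles + uops_issued + 4 * recovery_cycles) / total_slots) * 100
    return front_end, speculation, retiring, back_end

# Hypothetical counts for UOPS_ISSUED_ANY, UOPS_RETIRED_RETIRE_SLOTS,
# IDQ_UOPS_NOT_DELIVERED_CORE, INT_MISC_RECOVERY_CYCLES, CPU_CLK_UNHALTED_CORE:
print(tma_breakdown(1.1e9, 9.0e8, 4.0e8, 5.0e6, 5.0e8))  # -> (20.0, 11.0, 45.0, 24.0)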
diff --git a/collectors/likwid/groups/skylake/UOPS_EXEC.txt b/collectors/likwid/groups/skylake/UOPS_EXEC.txt new file mode 100644 index 0000000..7042df7 --- /dev/null +++ b/collectors/likwid/groups/skylake/UOPS_EXEC.txt @@ -0,0 +1,31 @@ +SHORT UOPs execution + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_USED_CYCLES +PMC1 UOPS_EXECUTED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES +Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES +Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the execution stage in the pipeline. Used cycles are all cycles where uOPs are +executed while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles. diff --git a/collectors/likwid/groups/skylake/UOPS_ISSUE.txt b/collectors/likwid/groups/skylake/UOPS_ISSUE.txt new file mode 100644 index 0000000..9aac923 --- /dev/null +++ b/collectors/likwid/groups/skylake/UOPS_ISSUE.txt @@ -0,0 +1,31 @@ +SHORT UOPs issuing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_USED_CYCLES +PMC1 UOPS_ISSUED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES +Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES +Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the issue stage in the pipeline. Used cycles are all cycles where uOPs are +issued while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles.
diff --git a/collectors/likwid/groups/skylake/UOPS_RETIRE.txt b/collectors/likwid/groups/skylake/UOPS_RETIRE.txt new file mode 100644 index 0000000..0f37585 --- /dev/null +++ b/collectors/likwid/groups/skylake/UOPS_RETIRE.txt @@ -0,0 +1,31 @@ +SHORT UOPs retirement + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_RETIRED_USED_CYCLES +PMC1 UOPS_RETIRED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES +Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES +Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the retirement stage in the pipeline (re-order buffer). Used cycles are all +cycles where uOPs are retired while unused cycles refer to pipeline stalls. +Moreover, the group calculates the average stall duration in cycles. diff --git a/collectors/likwid/groups/skylakeX/BRANCH.txt b/collectors/likwid/groups/skylakeX/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often, on average, a branch or a mispredicted branch occurred +per retired instruction. The branch misprediction ratio directly relates the +number of mispredicted branches to the total number of branch instructions. +Instructions per branch is 1/branch rate.
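A short Python sketch of the rate/ratio arithmetic in the BRANCH group above, with hypothetical counts:

# Hypothetical counts: INSTR_RETIRED_ANY, BR_INST_RETIRED_ALL_BRANCHES,
# BR_MISP_RETIRED_ALL_BRANCHES.
instructions = 8.0e9
branches     = 1.6e9
mispredicted = 8.0e6

branch_rate  = branches / instructions      # branches per retired instruction
misp_rate    = mispredicted / instructions  # mispredictions per retired instruction
misp_ratio   = mispredicted / branches      # share of all branches mispredicted
instr_per_br = instructions / branches      # = 1 / branch_rate
print(branch_rate, misp_rate, misp_ratio, instr_per_br)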
+ diff --git a/collectors/likwid/groups/skylakeX/CACHES.txt b/collectors/likwid/groups/skylakeX/CACHES.txt new file mode 100644 index 0000000..c700dd4 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/CACHES.txt @@ -0,0 +1,143 @@ +SHORT Cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L1D_M_EVICT +PMC2 L2_LINES_IN_ALL +PMC3 L2_TRANS_L2_WB +CBOX0C1 LLC_VICTIMS_M_STATE +CBOX1C1 LLC_VICTIMS_M_STATE +CBOX2C1 LLC_VICTIMS_M_STATE +CBOX3C1 LLC_VICTIMS_M_STATE +CBOX4C1 LLC_VICTIMS_M_STATE +CBOX5C1 LLC_VICTIMS_M_STATE +CBOX6C1 LLC_VICTIMS_M_STATE +CBOX7C1 LLC_VICTIMS_M_STATE +CBOX8C1 LLC_VICTIMS_M_STATE +CBOX9C1 LLC_VICTIMS_M_STATE +CBOX10C1 LLC_VICTIMS_M_STATE +CBOX11C1 LLC_VICTIMS_M_STATE +CBOX12C1 LLC_VICTIMS_M_STATE +CBOX13C1 LLC_VICTIMS_M_STATE +CBOX14C1 LLC_VICTIMS_M_STATE +CBOX15C1 LLC_VICTIMS_M_STATE +CBOX16C1 LLC_VICTIMS_M_STATE +CBOX17C1 LLC_VICTIMS_M_STATE +CBOX18C1 LLC_VICTIMS_M_STATE +CBOX19C1 LLC_VICTIMS_M_STATE +CBOX20C1 LLC_VICTIMS_M_STATE +CBOX21C1 LLC_VICTIMS_M_STATE +CBOX22C1 LLC_VICTIMS_M_STATE +CBOX23C1 LLC_VICTIMS_M_STATE +CBOX24C1 LLC_VICTIMS_M_STATE +CBOX25C1 LLC_VICTIMS_M_STATE +CBOX26C1 LLC_VICTIMS_M_STATE +CBOX27C1 LLC_VICTIMS_M_STATE +CBOX0C0 LLC_LOOKUP_DATA_READ +CBOX1C0 LLC_LOOKUP_DATA_READ +CBOX2C0 LLC_LOOKUP_DATA_READ +CBOX3C0 LLC_LOOKUP_DATA_READ +CBOX4C0 LLC_LOOKUP_DATA_READ +CBOX5C0 LLC_LOOKUP_DATA_READ +CBOX6C0 LLC_LOOKUP_DATA_READ +CBOX7C0 LLC_LOOKUP_DATA_READ +CBOX8C0 LLC_LOOKUP_DATA_READ +CBOX9C0 LLC_LOOKUP_DATA_READ +CBOX10C0 LLC_LOOKUP_DATA_READ +CBOX11C0 LLC_LOOKUP_DATA_READ +CBOX12C0 LLC_LOOKUP_DATA_READ +CBOX13C0 LLC_LOOKUP_DATA_READ +CBOX14C0 LLC_LOOKUP_DATA_READ +CBOX15C0 LLC_LOOKUP_DATA_READ +CBOX16C0 LLC_LOOKUP_DATA_READ +CBOX17C0 LLC_LOOKUP_DATA_READ +CBOX18C0 LLC_LOOKUP_DATA_READ +CBOX19C0 LLC_LOOKUP_DATA_READ +CBOX20C0 LLC_LOOKUP_DATA_READ +CBOX21C0 LLC_LOOKUP_DATA_READ +CBOX22C0 LLC_LOOKUP_DATA_READ +CBOX23C0 LLC_LOOKUP_DATA_READ +CBOX24C0 LLC_LOOKUP_DATA_READ +CBOX25C0 LLC_LOOKUP_DATA_READ +CBOX26C0 LLC_LOOKUP_DATA_READ +CBOX27C0 LLC_LOOKUP_DATA_READ +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 +L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time +L3 to L2 load data volume [GBytes] 1.0E-09*PMC2*64.0 +L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time +L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 +System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0)*64.0/time 
+System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0)*64.0 +L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64/time +L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64 +L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64.0/time +L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64.0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 + +LONG +Formulas: +L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time +L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64 +L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time +L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64 +L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time +L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64 +L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time +L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64 +L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time +L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64 +L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +System to L3 bandwidth 
[MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ))*64/time +System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ))*64 +L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M_STATE))*64/time +L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M_STATE))*64 +L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M_STATE))*64/time +L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M_STATE))*64 +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 +- +Group to measure cache transfers between L1 and memory. Please note that the +L3 to/from system metrics contain all traffic to the system (memory, +Intel UPI, etc.) but do not seem to capture all transfers, because the measured +memory read bandwidth and L3 to L2 bandwidth are commonly higher than the system to L3 bandwidth. + diff --git a/collectors/likwid/groups/skylakeX/CLOCK.txt b/collectors/likwid/groups/skylakeX/CLOCK.txt new file mode 100644 index 0000000..b81bee6 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/CLOCK.txt @@ -0,0 +1,26 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +UBOXFIX UNCORE_CLOCK + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*UBOXFIX/time +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time +- +Skylake-X implements the RAPL interface. This interface enables monitoring of +the energy consumed on the package (socket) level.
+ diff --git a/collectors/likwid/groups/skylakeX/CYCLE_ACTIVITY.txt b/collectors/likwid/groups/skylakeX/CYCLE_ACTIVITY.txt new file mode 100644 index 0000000..c432a44 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/CYCLE_ACTIVITY.txt @@ -0,0 +1,38 @@ +SHORT Cycle Activities + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING +PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING +PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING +PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Cycles without execution [%] (PMC3/FIXC1)*100 +Cycles without execution due to L1D [%] (PMC2/FIXC1)*100 +Cycles without execution due to L2 [%] (PMC0/FIXC1)*100 +Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100 +-- +This performance group measures the cycles while waiting for data from the cache +and memory hierarchy. +CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts number of cycles nothing is executed on +any execution port. +CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while memory subsystem has an +outstanding load.
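A small numeric sketch of how the CYCLE_ACTIVITY percentages above relate the pending/no-execute cycle counts to core cycles (all values invented):

core_cycles = 5.0e9   # CPU_CLK_UNHALTED_CORE (FIXC1)
no_execute  = 1.5e9   # CYCLE_ACTIVITY_CYCLES_NO_EXECUTE
l1d_pending = 6.0e8   # CYCLE_ACTIVITY_CYCLES_L1D_PENDING
l2_pending  = 4.0e8   # CYCLE_ACTIVITY_CYCLES_L2_PENDING
ldm_pending = 9.0e8   # CYCLE_ACTIVITY_CYCLES_LDM_PENDING

for name, cycles in [("no execute", no_execute), ("L1D pending", l1d_pending),
                     ("L2 pending", l2_pending), ("memory loads pending", ldm_pending)]:
    print(f"{name}: {cycles / core_cycles * 100:.1f}% of core cycles")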
diff --git a/collectors/likwid/groups/skylakeX/CYCLE_STALLS.txt b/collectors/likwid/groups/skylakeX/CYCLE_STALLS.txt new file mode 100644 index 0000000..795aeb9 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/CYCLE_STALLS.txt @@ -0,0 +1,45 @@ +SHORT Cycle Activities (Stalls) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING +PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING +PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING +PMC3 CYCLE_ACTIVITY_STALLS_TOTAL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Total execution stalls PMC3 +Stalls caused by L1D misses [%] (PMC2/PMC3)*100 +Stalls caused by L2 misses [%] (PMC0/PMC3)*100 +Stalls caused by memory loads [%] (PMC1/PMC3)*100 +Execution stall rate [%] (PMC3/FIXC1)*100 +Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100 +Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100 +Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL +Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100 +-- +This performance group measures the stalls caused by data traffic in the cache +hierarchy. +CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls. +CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while memory subsystem has +an outstanding load. diff --git a/collectors/likwid/groups/skylakeX/DATA.txt b/collectors/likwid/groups/skylakeX/DATA.txt new file mode 100644 index 0000000..4e6e938 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_INST_RETIRED_ALL_LOADS +PMC1 MEM_INST_RETIRED_ALL_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES +- +This is a metric to determine your load to store ratio. 
+ diff --git a/collectors/likwid/groups/skylakeX/DIVIDE.txt b/collectors/likwid/groups/skylakeX/DIVIDE.txt new file mode 100644 index 0000000..40b4ab6 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ARITH_DIVIDER_COUNT +PMC1 ARITH_DIVIDER_ACTIVE + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_DIVIDER_COUNT +Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/skylakeX/ENERGY.txt b/collectors/likwid/groups/skylakeX/ENERGY.txt new file mode 100644 index 0000000..fe7829f --- /dev/null +++ b/collectors/likwid/groups/skylakeX/ENERGY.txt @@ -0,0 +1,35 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR3 PWR_DRAM_ENERGY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +Skylake-X implements the RAPL interface. This interface enables monitoring of +the energy consumed on the package (socket) and DRAM level. + diff --git a/collectors/likwid/groups/skylakeX/FLOPS_AVX.txt b/collectors/likwid/groups/skylakeX/FLOPS_AVX.txt new file mode 100644 index 0000000..e44a913 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/FLOPS_AVX.txt @@ -0,0 +1,25 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0+PMC2*16.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0+PMC3*8.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +- +Packed AVX (256-bit) and AVX-512 FLOP rates.
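The FLOP weighting in the FLOPS_AVX group above follows directly from the vector width: a 256-bit packed instruction carries 8 single-precision or 4 double-precision elements, and a 512-bit one twice that. A hedged Python sketch with invented counts:

time_s      = 2.0
pack_sp_256 = 1.0e9   # FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
pack_dp_256 = 5.0e8   # FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
pack_sp_512 = 2.0e8   # FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE
pack_dp_512 = 1.0e8   # FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE

packed_sp_mflops = 1.0e-06 * (pack_sp_256 * 8.0 + pack_sp_512 * 16.0) / time_s
packed_dp_mflops = 1.0e-06 * (pack_dp_256 * 4.0 + pack_dp_512 * 8.0) / time_s
print(packed_sp_mflops, packed_dp_mflops)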
diff --git a/collectors/likwid/groups/skylakeX/FLOPS_DP.txt b/collectors/likwid/groups/skylakeX/FLOPS_DP.txt new file mode 100644 index 0000000..177cff2 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/FLOPS_DP.txt @@ -0,0 +1,34 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time +AVX512 DP [MFLOP/s] 1.0E-06*(PMC3*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +AVX512 DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE) +- +SSE scalar and packed double precision FLOP rates. 
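One more sketch, this time for the FLOPS_DP group above: the vectorization ratio counts packed FP instructions of any width against all FP instructions, while the MFLOP/s rate weights each class by its element count. Counter values are hypothetical:

time_s = 1.0
p128   = 1.0e8   # FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE (2 FLOPs each)
scalar = 4.0e8   # FP_ARITH_INST_RETIRED_SCALAR_DOUBLE      (1 FLOP each)
p256   = 6.0e8   # FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE (4 FLOPs each)
p512   = 2.0e8   # FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE (8 FLOPs each)

dp_mflops = 1.0e-06 * (p128 * 2.0 + scalar + p256 * 4.0 + p512 * 8.0) / time_s
vec_ratio = 100.0 * (p128 + p256 + p512) / (p128 + scalar + p256 + p512)
print(f"DP: {dp_mflops:.0f} MFLOP/s, vectorization ratio: {vec_ratio:.1f}%")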
+ diff --git a/collectors/likwid/groups/skylakeX/FLOPS_SP.txt b/collectors/likwid/groups/skylakeX/FLOPS_SP.txt new file mode 100644 index 0000000..01d98c2 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/FLOPS_SP.txt @@ -0,0 +1,34 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time +AVX512 SP [MFLOP/s] 1.0E-06*(PMC3*16.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +AVX512 SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE) +- +SSE scalar and packed single precision FLOP rates. + diff --git a/collectors/likwid/groups/skylakeX/L2.txt b/collectors/likwid/groups/skylakeX/L2.txt new file mode 100644 index 0000000..1a92a95 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/L2.txt @@ -0,0 +1,38 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L1D_M_EVICT +PMC2 ICACHE_64B_IFTAG_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_64B_IFTAG_MISS)*64/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_64B_IFTAG_MISS)*64 +- +Profiling group to measure L2 cache bandwidth. 
The bandwidth is computed from the +number of cache lines allocated in the L1 and the number of modified cache lines +evicted from the L1. The group also outputs the total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and traffic caused by misses in the +L1 instruction cache. + diff --git a/collectors/likwid/groups/skylakeX/L2CACHE.txt b/collectors/likwid/groups/skylakeX/L2CACHE.txt new file mode 100644 index 0000000..9b5dd4b --- /dev/null +++ b/collectors/likwid/groups/skylakeX/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_TRANS_ALL_REQUESTS +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS +- +This group measures the locality of your data accesses with regard to the +L2 cache. The L2 request rate tells you how data intensive your code is, +i.e. how many data accesses you have on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cache lines from a higher level. And finally, the L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/skylakeX/L3.txt b/collectors/likwid/groups/skylakeX/L3.txt new file mode 100644 index 0000000..219f932 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/L3.txt @@ -0,0 +1,48 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ALL +PMC1 L2_TRANS_L2_WB +PMC2 IDI_MISC_WB_DOWNGRADE +PMC3 IDI_MISC_WB_UPGRADE + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L3|MEM evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3|MEM evict data volume [GBytes] 1.0E-09*PMC1*64.0 +Dropped CLs bandwidth [MBytes/s] 1.0E-6*PMC2*64.0/time +Dropped CLs data volume [GBytes] 1.0E-9*PMC2*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*IDI_MISC_WB_UPGRADE*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*IDI_MISC_WB_UPGRADE*64.0 +Dropped CLs bandwidth [MBytes/s] = 1.0E-6*IDI_MISC_WB_DOWNGRADE*64.0/time +Dropped CLs data volume [GBytes] = 1.0E-9*IDI_MISC_WB_DOWNGRADE*64.0 +L3|MEM evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time +L3|MEM evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L3 data volume
[GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +-- +Profiling group to measure L3 cache bandwidth and data volume. For Intel Skylake +or Cascadelake, the L3 is a victim cache. This means that all data is loaded +from memory directly into the L2 cache (if the L3 prefetcher is inactive). Modified +data in L2 is evicted to L3 (the additional data transfer due to the non-inclusiveness of +the L3 can be measured). Clean cache lines (only loaded data) might get dropped in +L2 to reduce traffic. If the amount of clean cache lines is smaller than the L3, they +might still be evicted to L3 due to some heuristic. diff --git a/collectors/likwid/groups/skylakeX/L3CACHE.txt b/collectors/likwid/groups/skylakeX/L3CACHE.txt new file mode 100644 index 0000000..bc664d1 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/L3CACHE.txt @@ -0,0 +1,35 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_RETIRED_L3_HIT +PMC1 MEM_LOAD_RETIRED_L3_MISS +PMC2 UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate (PMC0+PMC1)/PMC2 +L3 miss rate PMC1/PMC2 +L3 miss ratio PMC1/(PMC0+PMC1) + +LONG +Formulas: +L3 request rate = (MEM_LOAD_RETIRED_L3_HIT+MEM_LOAD_RETIRED_L3_MISS)/UOPS_RETIRED_ALL +L3 miss rate = MEM_LOAD_RETIRED_L3_MISS/UOPS_RETIRED_ALL +L3 miss ratio = MEM_LOAD_RETIRED_L3_MISS/(MEM_LOAD_RETIRED_L3_HIT+MEM_LOAD_RETIRED_L3_MISS) +- +This group measures the locality of your data accesses with regard to the +L3 cache. The L3 request rate tells you how data intensive your code is, +i.e. how many data accesses you have on average per instruction. +The L3 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally, the L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse.
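A sketch of the L3CACHE rate/ratio arithmetic above; note that this group normalizes by retired uops (UOPS_RETIRED_ALL), not instructions. All counts are invented:

uops_retired = 6.0e9   # UOPS_RETIRED_ALL
l3_hits      = 3.0e7   # MEM_LOAD_RETIRED_L3_HIT
l3_misses    = 1.0e7   # MEM_LOAD_RETIRED_L3_MISS

request_rate = (l3_hits + l3_misses) / uops_retired  # L3 accesses per uop
miss_rate    = l3_misses / uops_retired              # L3 misses per uop
miss_ratio   = l3_misses / (l3_hits + l3_misses)     # share of L3 accesses that miss
print(request_rate, miss_rate, miss_ratio)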
+ + diff --git a/collectors/likwid/groups/skylakeX/MEM.txt b/collectors/likwid/groups/skylakeX/MEM.txt new file mode 100644 index 0000000..3d50ecb --- /dev/null +++ b/collectors/likwid/groups/skylakeX/MEM.txt @@ -0,0 +1,48 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on a +per-socket basis. Some of the counters may not be available on your system. +Also outputs total data volume transferred from main memory. +The same metrics are provided by the HA group.
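The bandwidth metrics in the MEM group above all follow one pattern: sum the per-channel CAS counts, multiply by the 64-byte cache line size, and scale by time. A minimal Python sketch with invented per-channel counts:

time_s = 2.0
cas_rd = [1.2e8, 1.1e8, 1.3e8, 1.2e8, 1.0e8, 1.1e8]   # MBOX0C0..MBOX5C0 CAS_COUNT_RD
cas_wr = [4.0e7, 3.5e7, 4.2e7, 3.8e7, 3.9e7, 4.1e7]   # MBOX0C1..MBOX5C1 CAS_COUNT_WR

read_bw   = 1.0e-06 * sum(cas_rd) * 64.0 / time_s          # MBytes/s
write_bw  = 1.0e-06 * sum(cas_wr) * 64.0 / time_s          # MBytes/s
total_vol = 1.0e-09 * (sum(cas_rd) + sum(cas_wr)) * 64.0   # GBytes
print(read_bw, write_bw, total_vol)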
+ diff --git a/collectors/likwid/groups/skylakeX/MEM_DP.txt b/collectors/likwid/groups/skylakeX/MEM_DP.txt new file mode 100644 index 0000000..d6e481a --- /dev/null +++ b/collectors/likwid/groups/skylakeX/MEM_DP.txt @@ -0,0 +1,70 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +Operational intensity (PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 +Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/((SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0) +-- +Profiling group to measure memory bandwidth
drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per-socket basis. Also outputs total data volume transferred from main memory. +SSE scalar and packed double precision FLOP rates. Also reports on packed AVX +and AVX-512 instructions. +The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. diff --git a/collectors/likwid/groups/skylakeX/MEM_SP.txt b/collectors/likwid/groups/skylakeX/MEM_SP.txt new file mode 100644 index 0000000..5720938 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/MEM_SP.txt @@ -0,0 +1,70 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +Operational intensity (PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime +Memory
write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 +Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/((SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0) +-- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per-socket basis. Also outputs total data volume transferred from main memory. +SSE scalar and packed single precision FLOP rates. Also reports on packed AVX +and AVX-512 instructions. +The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. diff --git a/collectors/likwid/groups/skylakeX/TLB_DATA.txt b/collectors/likwid/groups/skylakeX/TLB_DATA.txt new file mode 100644 index 0000000..10ee5e1 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/TLB_DATA.txt @@ -0,0 +1,35 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK +PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK +PMC2 DTLB_LOAD_MISSES_WALK_ACTIVE +PMC3 DTLB_STORE_MISSES_WALK_ACTIVE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB load misses PMC0 +L1 DTLB load miss rate PMC0/FIXC0 +L1 DTLB load miss duration [Cyc] PMC2/PMC0 +L1 DTLB store misses PMC1 +L1 DTLB store miss rate PMC1/FIXC0 +L1 DTLB store miss duration [Cyc] PMC3/PMC1 + +LONG +Formulas: +L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_ACTIVE / DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK +L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_ACTIVE / DTLB_STORE_MISSES_CAUSES_A_WALK +- +The DTLB load and store miss rates give a measure of how often a TLB miss occurred +per instruction. The miss duration measures how many cycles a page table walk took on average. + diff --git a/collectors/likwid/groups/skylakeX/TLB_INSTR.txt b/collectors/likwid/groups/skylakeX/TLB_INSTR.txt new file mode 100644 index 0000000..9bc65a7 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/TLB_INSTR.txt @@ -0,0 +1,28 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ITLB_MISSES_CAUSES_A_WALK +PMC1 ITLB_MISSES_WALK_ACTIVE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + + +LONG +Formulas: +L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK +L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_ACTIVE / ITLB_MISSES_CAUSES_A_WALK +- +The ITLB miss rate gives a measure of how often a TLB miss occurred +per instruction.
The miss duration measures how many cycles a page table walk took on average. + diff --git a/collectors/likwid/groups/skylakeX/TMA.txt b/collectors/likwid/groups/skylakeX/TMA.txt new file mode 100644 index 0000000..afb4126 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/TMA.txt @@ -0,0 +1,48 @@ +SHORT Top down cycle allocation + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_RETIRED_RETIRE_SLOTS +PMC2 IDQ_UOPS_NOT_DELIVERED_CORE +PMC3 INT_MISC_RECOVERY_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +IPC FIXC0/FIXC1 +Total Slots 4*FIXC1 +Slots Retired PMC1 +Fetch Bubbles PMC2 +Recovery Bubbles 4*PMC3 +Front End [%] PMC2/(4*FIXC1)*100 +Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100 +Retiring [%] PMC1/(4*FIXC1)*100 +Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100 + +LONG +Formulas: +Total Slots = 4*CPU_CLK_UNHALTED_CORE +Slots Retired = UOPS_RETIRED_RETIRE_SLOTS +Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE +Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES +Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100 +Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100 +Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100 +Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100 +-- +This performance group measures cycles to determine the percentage of time spent in +front end, back end, retiring and speculation. These metrics are published and +verified by Intel. Further information: +Webpage describing the Top-Down Method and its usage in Intel VTune: +https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method +Paper by Ahmad Yasin: +https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0 +Slides by Ahmad Yasin: +http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf +The performance group was originally published here: +http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/ diff --git a/collectors/likwid/groups/skylakeX/UOPS_EXEC.txt b/collectors/likwid/groups/skylakeX/UOPS_EXEC.txt new file mode 100644 index 0000000..7042df7 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/UOPS_EXEC.txt @@ -0,0 +1,31 @@ +SHORT UOPs execution + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_USED_CYCLES +PMC1 UOPS_EXECUTED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES +Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES +Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the execution stage in the pipeline.
Used cycles are all cycles where uOPs are +executed while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles. diff --git a/collectors/likwid/groups/skylakeX/UOPS_ISSUE.txt b/collectors/likwid/groups/skylakeX/UOPS_ISSUE.txt new file mode 100644 index 0000000..9aac923 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/UOPS_ISSUE.txt @@ -0,0 +1,31 @@ +SHORT UOPs issuing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_USED_CYCLES +PMC1 UOPS_ISSUED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the issue stage in the pipeline. Used cycles are all cycles where uOPs are +issued while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles. diff --git a/collectors/likwid/groups/skylakeX/UOPS_RETIRE.txt b/collectors/likwid/groups/skylakeX/UOPS_RETIRE.txt new file mode 100644 index 0000000..0f37585 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/UOPS_RETIRE.txt @@ -0,0 +1,31 @@ +SHORT UOPs retirement + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_RETIRED_USED_CYCLES +PMC1 UOPS_RETIRED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES + +METRICS
+Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the retirement stage in the pipeline (re-order buffer). Used cycles are all +cycles where uOPs are retired while unused cycles refer to pipeline stalls. +Moreover, the group calculates the average stall duration in cycles.
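The used/unused cycle ratios and the average stall duration in the three UOPS_* groups above reduce to plain arithmetic on the raw counts. A minimal C sketch with hypothetical counter readings (the variable names are illustrative only, not part of LIKWID):

#include <stdio.h>

int main(void) {
    /* hypothetical raw counts for one measurement interval */
    double used_cycles  = 8.0e8;  /* UOPS_RETIRED_USED_CYCLES        (PMC0) */
    double stall_cycles = 2.0e8;  /* UOPS_RETIRED_STALL_CYCLES       (PMC1) */
    double total_cycles = 1.0e9;  /* CPU_CLOCK_UNHALTED_TOTAL_CYCLES (PMC2) */
    double stall_phases = 4.0e6;  /* PMC3:EDGEDETECT; edge detection turns the
                                     stall-cycle event into a count of distinct
                                     stall episodes rather than cycles */

    printf("Used cycles ratio [%%]    %.2f\n", 100.0 * used_cycles  / total_cycles);
    printf("Unused cycles ratio [%%]  %.2f\n", 100.0 * stall_cycles / total_cycles);
    /* stall cycles divided by stall episodes = average stall length */
    printf("Avg stall duration [cyc] %.2f\n", stall_cycles / stall_phases);
    return 0;
}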
diff --git a/collectors/likwid/groups/skylakeX/UPI.txt b/collectors/likwid/groups/skylakeX/UPI.txt new file mode 100644 index 0000000..2a4c44f --- /dev/null +++ b/collectors/likwid/groups/skylakeX/UPI.txt @@ -0,0 +1,42 @@ +SHORT UPI data traffic + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +SBOX0C0 TXL_FLITS_ALL_DATA +SBOX0C1 RXL_FLITS_ALL_DATA +SBOX1C0 TXL_FLITS_ALL_DATA +SBOX1C1 RXL_FLITS_ALL_DATA +SBOX2C0 TXL_FLITS_ALL_DATA +SBOX2C1 RXL_FLITS_ALL_DATA + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Received data bandwidth [MByte/s] 1.0E-06*((SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0/time +Received data volume [GByte] 1.0E-09*((SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0 +Sent data bandwidth [MByte/s] 1.0E-06*((SBOX0C0+SBOX1C0+SBOX2C0)/9.0)*64.0/time +Sent data volume [GByte] 1.0E-09*((SBOX0C0+SBOX1C0+SBOX2C0)/9.0)*64.0 +Total data bandwidth [MByte/s] 1.0E-06*((SBOX0C0+SBOX1C0+SBOX2C0+SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0/time +Total data volume [GByte] 1.0E-09*((SBOX0C0+SBOX1C0+SBOX2C0+SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0 + + +LONG +Formulas: +Received data bandwidth [MByte/s] = 1.0E-06*(SUM(RXL_FLITS_ALL_DATA)/9.0)*64.0/runtime +Received data volume [GByte] = 1.0E-09*(SUM(RXL_FLITS_ALL_DATA)/9.0)*64.0 +Sent data bandwidth [MByte/s] = 1.0E-06*(SUM(TXL_FLITS_ALL_DATA)/9.0)*64.0/time +Sent data volume [GByte] = 1.0E-09*(SUM(TXL_FLITS_ALL_DATA)/9.0)*64.0 +Total data bandwidth [MByte/s] = 1.0E-06*((SUM(RXL_FLITS_ALL_DATA)+SUM(TXL_FLITS_ALL_DATA))/9.0)*64.0/time +Total data volume [GByte] = 1.0E-09*((SUM(RXL_FLITS_ALL_DATA)+SUM(TXL_FLITS_ALL_DATA))/9.0)*64.0 +-- +This group measures the data traffic on the UPI (socket interconnect). The group +measures all filled data slots (9 slots per 64 Byte data transfer), which is why +the count needs to be divided by 9. These 9 data chunks are not transferred in +a single flit; there is one flit for the header and three flits for the data. +The metrics may show higher values than expected because the events also count +other transfers which include data. diff --git a/collectors/likwid/groups/westmere/BRANCH.txt b/collectors/likwid/groups/westmere/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/westmere/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio directly states +what fraction of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate.
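The four derived branch metrics follow directly from the two programmable counters and the fixed instruction counter. A short C sketch with made-up counts (names are illustrative only):

#include <stdio.h>

int main(void) {
    double instr      = 1.0e9;  /* INSTR_RETIRED_ANY            (FIXC0) */
    double branches   = 2.0e8;  /* BR_INST_RETIRED_ALL_BRANCHES (PMC0)  */
    double mispredict = 4.0e6;  /* BR_MISP_RETIRED_ALL_BRANCHES (PMC1)  */

    printf("Branch rate                %.4f\n", branches / instr);
    printf("Branch misprediction rate  %.4f\n", mispredict / instr);
    printf("Branch misprediction ratio %.4f\n", mispredict / branches);
    /* instructions per branch is the reciprocal of the branch rate */
    printf("Instructions per branch    %.2f\n", instr / branches);
    return 0;
}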
+ diff --git a/collectors/likwid/groups/westmere/CACHE.txt b/collectors/likwid/groups/westmere/CACHE.txt new file mode 100644 index 0000000..6a5e4fe --- /dev/null +++ b/collectors/likwid/groups/westmere/CACHE.txt @@ -0,0 +1,26 @@ +SHORT Data cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +data cache misses PMC0 +data cache miss rate PMC0/FIXC0 + +LONG +Formulas: +data cache misses = L1D_REPL +data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY +- +This group measures the locality of your data accesses with regard to the +L1 cache. +The data cache miss rate gives a measure how often it was necessary to get +cache lines from higher levels of the memory hierarchy. + diff --git a/collectors/likwid/groups/westmere/CLOCK.txt b/collectors/likwid/groups/westmere/CLOCK.txt new file mode 100644 index 0000000..5f862a5 --- /dev/null +++ b/collectors/likwid/groups/westmere/CLOCK.txt @@ -0,0 +1,21 @@ +SHORT CPU clock information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 + +LONG +Formulas: +Runtime (RDTSC) [s] = time +Runtime unhalted [s] = CPU_CLK_UNHALTED_CORE*inverseClock +Clock [MHz] = 1.E-06*(CPU_CLK_UNHALTED_CORE/CPU_CLK_UNHALTED_REF)/inverseClock +CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY +- +CPU clock information diff --git a/collectors/likwid/groups/westmere/DATA.txt b/collectors/likwid/groups/westmere/DATA.txt new file mode 100644 index 0000000..31bba51 --- /dev/null +++ b/collectors/likwid/groups/westmere/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_INST_RETIRED_LOADS +PMC1 MEM_INST_RETIRED_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_INST_RETIRED_LOADS/MEM_INST_RETIRED_STORES +- +This is a simple metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/westmere/DIVIDE.txt b/collectors/likwid/groups/westmere/DIVIDE.txt new file mode 100644 index 0000000..2677a19 --- /dev/null +++ b/collectors/likwid/groups/westmere/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ARITH_NUM_DIV +PMC1 ARITH_CYCLES_DIV_BUSY + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_NUM_DIV +Avg. 
divide unit usage duration = ARITH_CYCLES_DIV_BUSY/ARITH_NUM_DIV +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/westmere/FLOPS_DP.txt b/collectors/likwid/groups/westmere/FLOPS_DP.txt new file mode 100644 index 0000000..c5c8203 --- /dev/null +++ b/collectors/likwid/groups/westmere/FLOPS_DP.txt @@ -0,0 +1,35 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR +PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION +PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +SP [MUOPS/s] 1.0E-06*PMC2/time +DP [MUOPS/s] 1.0E-06*PMC3/time + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime +Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime +SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime +DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime +- +Westmere offers no way to measure MFLOP rates exactly if mixed precision calculations are done. +Therefore both single and double precision are measured to ensure the correctness +of the measurements. You can check whether your code was vectorized by comparing the number of +FP_COMP_OPS_EXE_SSE_FP_PACKED events with the FP_COMP_OPS_EXE_SSE_FP_SCALAR events. + diff --git a/collectors/likwid/groups/westmere/FLOPS_SP.txt b/collectors/likwid/groups/westmere/FLOPS_SP.txt new file mode 100644 index 0000000..933b058 --- /dev/null +++ b/collectors/likwid/groups/westmere/FLOPS_SP.txt @@ -0,0 +1,35 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR +PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION +PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +SP [MUOPS/s] 1.0E-06*PMC2/time +DP [MUOPS/s] 1.0E-06*PMC3/time + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime +Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime +SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime +DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime +- +Westmere offers no way to measure MFLOP rates exactly if mixed precision calculations are done. +Therefore both single and double precision are measured to ensure the correctness +of the measurements. You can check whether your code was vectorized by comparing the number of +FP_COMP_OPS_EXE_SSE_FP_PACKED events with the FP_COMP_OPS_EXE_SSE_FP_SCALAR events.
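The packed/scalar weighting is the whole trick behind these MFLOP/s metrics: one 128-bit packed SSE uop carries four single-precision (or two double-precision) elements, a scalar uop carries one. A hedged C sketch with hypothetical counts and runtime:

#include <stdio.h>

int main(void) {
    double packed = 5.0e8;  /* FP_COMP_OPS_EXE_SSE_FP_PACKED (PMC0) */
    double scalar = 1.0e8;  /* FP_COMP_OPS_EXE_SSE_FP_SCALAR (PMC1) */
    double time   = 2.0;    /* measured runtime in seconds */

    /* 128-bit SSE: 4 SP elements or 2 DP elements per packed uop */
    printf("SP [MFLOP/s] %.2f\n", 1.0e-6 * (packed * 4.0 + scalar) / time);
    printf("DP [MFLOP/s] %.2f\n", 1.0e-6 * (packed * 2.0 + scalar) / time);
    return 0;
}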
+ diff --git a/collectors/likwid/groups/westmere/FLOPS_X87.txt b/collectors/likwid/groups/westmere/FLOPS_X87.txt new file mode 100644 index 0000000..39cd8b4 --- /dev/null +++ b/collectors/likwid/groups/westmere/FLOPS_X87.txt @@ -0,0 +1,21 @@ +SHORT X87 MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INST_RETIRED_X87 + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +X87 [MFLOP/s] 1.0E-06*PMC0/time + +LONG +Formulas: +X87 [MFLOP/s] = 1.0E-06*INST_RETIRED_X87/runtime +- +Profiling group to measure X87 FLOP rate. + diff --git a/collectors/likwid/groups/westmere/ICACHE.txt b/collectors/likwid/groups/westmere/ICACHE.txt new file mode 100644 index 0000000..49943ff --- /dev/null +++ b/collectors/likwid/groups/westmere/ICACHE.txt @@ -0,0 +1,25 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1I_READS +PMC1 L1I_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 + +LONG +Formulas: +L1I request rate = L1I_READS / INSTR_RETIRED_ANY +L1I miss rate = L1I_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = L1I_MISSES / L1I_READS +- +This group measures some L1 instruction cache metrics. diff --git a/collectors/likwid/groups/westmere/L2.txt b/collectors/likwid/groups/westmere/L2.txt new file mode 100644 index 0000000..74f7d58 --- /dev/null +++ b/collectors/likwid/groups/westmere/L2.txt @@ -0,0 +1,38 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPL +PMC1 L1D_M_EVICT +PMC2 L1I_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the +number of cache lines allocated in the L1 and the number of modified cache lines +evicted from the L1. The group also reports the data volume transferred between +L2 and L1 cache. Note that this bandwidth also includes data transfers due to a +write allocate load on a store miss in L1 and traffic caused by misses in the +L1 instruction cache.
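All cache and memory bandwidths in these groups follow the same recipe: count line-sized transfers, multiply by 64 bytes, divide by runtime. A minimal C sketch with invented counts (names are illustrative only):

#include <stdio.h>

int main(void) {
    double l1d_repl   = 3.0e8;  /* L1D_REPL    (PMC0): lines loaded into L1   */
    double l1d_evict  = 1.0e8;  /* L1D_M_EVICT (PMC1): modified lines evicted */
    double l1i_misses = 1.0e7;  /* L1I_MISSES  (PMC2): instruction line loads */
    double time       = 1.5;    /* runtime in seconds */

    double bytes = (l1d_repl + l1d_evict + l1i_misses) * 64.0;  /* 64 B per line */
    printf("L2 bandwidth [MBytes/s] %.2f\n", 1.0e-6 * bytes / time);
    printf("L2 data volume [GBytes] %.3f\n", 1.0e-9 * bytes);
    return 0;
}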
+ diff --git a/collectors/likwid/groups/westmere/L2CACHE.txt b/collectors/likwid/groups/westmere/L2CACHE.txt new file mode 100644 index 0000000..343b263 --- /dev/null +++ b/collectors/likwid/groups/westmere/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_RQSTS_REFERENCES +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_RQSTS_REFERENCES/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_RQSTS_REFERENCES +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure how often it was necessary to get +cache lines from memory. And finally L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/westmere/L3.txt b/collectors/likwid/groups/westmere/L3.txt new file mode 100644 index 0000000..a1d95e3 --- /dev/null +++ b/collectors/likwid/groups/westmere/L3.txt @@ -0,0 +1,37 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_RQSTS_MISS +PMC1 L2_LINES_OUT_DIRTY_ANY + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*(PMC1)*64.0/time +L3 evict data volume [GBytes] 1.0E-09*(PMC1)*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_RQSTS_MISS*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_RQSTS_MISS*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DIRTY_ANY*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DIRTY_ANY*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_RQSTS_MISS+L2_LINES_OUT_DIRTY_ANY)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_RQSTS_MISS+L2_LINES_OUT_DIRTY_ANY)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the +number of cache lines allocated in the L2 and the number of modified cache lines +evicted from the L2. The group also reports the total data volume transferred between +L3 and the measured L2 cache. Note that this bandwidth also includes data transfers +due to a write allocate load on a store miss in L2.
+ diff --git a/collectors/likwid/groups/westmere/L3CACHE.txt b/collectors/likwid/groups/westmere/L3CACHE.txt new file mode 100644 index 0000000..58072c1 --- /dev/null +++ b/collectors/likwid/groups/westmere/L3CACHE.txt @@ -0,0 +1,34 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +UPMC0 UNC_L3_HITS_ANY +UPMC1 UNC_L3_MISS_ANY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate (UPMC0+UPMC1)/FIXC0 +L3 miss rate UPMC1/FIXC0 +L3 miss ratio UPMC1/(UPMC0+UPMC1) + +LONG +Formulas: +L3 request rate = (UNC_L3_HITS_ANY+UNC_L3_MISS_ANY)/INSTR_RETIRED_ANY +L3 miss rate = UNC_L3_MISS_ANY/INSTR_RETIRED_ANY +L3 miss ratio = UNC_L3_MISS_ANY/(UNC_L3_HITS_ANY+UNC_L3_MISS_ANY) +- +This group measures the locality of your data accesses with regard to the L3 +Cache. L3 request rate tells you how data intensive your code is or how many +data accesses you have on average per instruction. The L3 miss rate gives a +measure how often it was necessary to get cache lines from memory. And finally +L3 miss ratio tells you how many of your memory references required a cache line +to be loaded from a higher level. While the data cache miss rate might be given +by your algorithm you should try to get data cache miss ratio as low as +possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/westmere/MEM.txt b/collectors/likwid/groups/westmere/MEM.txt new file mode 100644 index 0000000..b5165e1 --- /dev/null +++ b/collectors/likwid/groups/westmere/MEM.txt @@ -0,0 +1,50 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +UPMC0 UNC_QMC_NORMAL_READS_ANY +UPMC1 UNC_QMC_WRITES_FULL_ANY +UPMC2 UNC_QHL_REQUESTS_REMOTE_READS +UPMC3 UNC_QHL_REQUESTS_REMOTE_WRITES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*UPMC0*64.0/time +Memory read data volume [GBytes] 1.0E-09*UPMC0*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*UPMC1*64.0/time +Memory write data volume [GBytes] 1.0E-09*UPMC1*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64.0 +Remote memory read bandwidth [MBytes/s] 1.0E-06*UPMC2*64.0/time +Remote memory read data volume [GBytes] 1.0E-09*UPMC2*64.0 +Remote memory write bandwidth [MBytes/s] 1.0E-06*UPMC3*64.0/time +Remote memory write data volume [GBytes] 1.0E-09*UPMC3*64.0 +Remote memory bandwidth [MBytes/s] 1.0E-06*(UPMC2+UPMC3)*64.0/time +Remote memory data volume [GBytes] 1.0E-09*(UPMC2+UPMC3)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_NORMAL_READS_ANY*64.0/time +Memory read data volume [GBytes] = 1.0E-09*UNC_QMC_NORMAL_READS_ANY*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_WRITES_FULL_ANY*64.0/time +Memory write data volume [GBytes] = 1.0E-09*UNC_QMC_WRITES_FULL_ANY*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0/time +Memory data volume [GBytes] = 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0 +Remote memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_READS*64.0/time +Remote memory read data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_READS*64.0 +Remote memory write bandwidth [MBytes/s] = 
1.0E-06*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0/time +Remote memory write data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0 +Remote memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0/time +Remote memory data volume [GBytes] = 1.0E-09*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +This group will be measured by one core per socket. The remote read BW tells +you if cache lines are transferred between sockets, meaning that cores access +data owned by a remote NUMA domain. The group also reports total data volume +transferred from main memory. + diff --git a/collectors/likwid/groups/westmere/MEM_DP.txt b/collectors/likwid/groups/westmere/MEM_DP.txt new file mode 100644 index 0000000..64161dd --- /dev/null +++ b/collectors/likwid/groups/westmere/MEM_DP.txt @@ -0,0 +1,66 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR +PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION +PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION +UPMC0 UNC_QMC_NORMAL_READS_ANY +UPMC1 UNC_QMC_WRITES_FULL_ANY +UPMC2 UNC_QHL_REQUESTS_REMOTE_READS +UPMC3 UNC_QHL_REQUESTS_REMOTE_WRITES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +SP [MUOPS/s] 1.0E-06*PMC2/time +DP [MUOPS/s] 1.0E-06*PMC3/time +Memory read bandwidth [MBytes/s] 1.0E-06*UPMC0*64.0/time +Memory read data volume [GBytes] 1.0E-09*UPMC0*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*UPMC1*64.0/time +Memory write data volume [GBytes] 1.0E-09*UPMC1*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64.0 +Remote memory read bandwidth [MBytes/s] 1.0E-06*UPMC2*64.0/time +Remote memory read data volume [GBytes] 1.0E-09*UPMC2*64.0 +Remote memory write bandwidth [MBytes/s] 1.0E-06*UPMC3*64.0/time +Remote memory write data volume [GBytes] 1.0E-09*UPMC3*64.0 +Remote memory bandwidth [MBytes/s] 1.0E-06*(UPMC2+UPMC3)*64.0/time +Remote memory data volume [GBytes] 1.0E-09*(UPMC2+UPMC3)*64.0 +Operational intensity (PMC0*2.0+PMC1)/((UPMC0+UPMC1)*64.0) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime +Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime +SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime +DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_NORMAL_READS_ANY*64.0/time +Memory read data volume [GBytes] = 1.0E-09*UNC_QMC_NORMAL_READS_ANY*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_WRITES_FULL_ANY*64.0/time +Memory write data volume [GBytes] = 1.0E-09*UNC_QMC_WRITES_FULL_ANY*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0/time +Memory data volume [GBytes] = 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0 +Remote memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_READS*64.0/time +Remote memory read data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_READS*64.0 
+Remote memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0/time +Remote memory write data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0 +Remote memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0/time +Remote memory data volume [GBytes] = 1.0E-09*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0 +Operational intensity = (FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/((UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0) +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +This group will be measured by one core per socket. The remote read BW tells +you if cache lines are transferred between sockets, meaning that cores access +data owned by a remote NUMA domain. The group also reports total data volume +transferred from main memory. + diff --git a/collectors/likwid/groups/westmere/MEM_SP.txt b/collectors/likwid/groups/westmere/MEM_SP.txt new file mode 100644 index 0000000..812c7fa --- /dev/null +++ b/collectors/likwid/groups/westmere/MEM_SP.txt @@ -0,0 +1,66 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR +PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION +PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION +UPMC0 UNC_QMC_NORMAL_READS_ANY +UPMC1 UNC_QMC_WRITES_FULL_ANY +UPMC2 UNC_QHL_REQUESTS_REMOTE_READS +UPMC3 UNC_QHL_REQUESTS_REMOTE_WRITES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +SP [MUOPS/s] 1.0E-06*PMC2/time +DP [MUOPS/s] 1.0E-06*PMC3/time +Memory read bandwidth [MBytes/s] 1.0E-06*UPMC0*64.0/time +Memory read data volume [GBytes] 1.0E-09*UPMC0*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*UPMC1*64.0/time +Memory write data volume [GBytes] 1.0E-09*UPMC1*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64.0 +Remote memory read bandwidth [MBytes/s] 1.0E-06*UPMC2*64.0/time +Remote memory read data volume [GBytes] 1.0E-09*UPMC2*64.0 +Remote memory write bandwidth [MBytes/s] 1.0E-06*UPMC3*64.0/time +Remote memory write data volume [GBytes] 1.0E-09*UPMC3*64.0 +Remote memory bandwidth [MBytes/s] 1.0E-06*(UPMC2+UPMC3)*64.0/time +Remote memory data volume [GBytes] 1.0E-09*(UPMC2+UPMC3)*64.0 +Operational intensity (PMC0*4.0+PMC1)/((UPMC0+UPMC1)*64.0) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime +Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime +SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime +DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_NORMAL_READS_ANY*64.0/time +Memory read data volume [GBytes] = 1.0E-09*UNC_QMC_NORMAL_READS_ANY*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_WRITES_FULL_ANY*64.0/time +Memory write data volume [GBytes] = 1.0E-09*UNC_QMC_WRITES_FULL_ANY*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0/time +Memory data volume [GBytes] = 
1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0 +Remote memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_READS*64.0/time +Remote memory read data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_READS*64.0 +Remote memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0/time +Remote memory write data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0 +Remote memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0/time +Remote memory data volume [GBytes] = 1.0E-09*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0 +Operational intensity = (FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/((UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0) +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +This group will be measured by one core per socket. The remote read BW tells +you if cache lines are transferred between sockets, meaning that cores access +data owned by a remote NUMA domain. The group also reports total data volume +transferred from main memory. + diff --git a/collectors/likwid/groups/westmere/TLB_DATA.txt b/collectors/likwid/groups/westmere/TLB_DATA.txt new file mode 100644 index 0000000..d256b8c --- /dev/null +++ b/collectors/likwid/groups/westmere/TLB_DATA.txt @@ -0,0 +1,35 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_LOAD_MISSES_ANY +PMC1 DTLB_MISSES_ANY +PMC2 DTLB_LOAD_MISSES_WALK_CYCLES +PMC3 DTLB_MISSES_WALK_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB load misses PMC0 +L1 DTLB load miss rate PMC0/FIXC0 +L1 DTLB load miss duration [Cyc] PMC2/PMC0 +L1 DTLB store misses (PMC1-PMC0) +L1 DTLB store miss rate (PMC1-PMC0)/FIXC0 +L1 DTLB store miss duration [Cyc] (PMC3-PMC2)/(PMC1-PMC0) + +LONG +Formulas: +L1 DTLB load misses = DTLB_LOAD_MISSES_ANY +L1 DTLB load miss rate = DTLB_LOAD_MISSES_ANY / INSTR_RETIRED_ANY +L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_CYCLES / DTLB_LOAD_MISSES_ANY +L1 DTLB store misses = DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY +L1 DTLB store miss rate = (DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY) / INSTR_RETIRED_ANY +L1 DTLB store miss duration [Cyc] = (DTLB_MISSES_WALK_CYCLES-DTLB_LOAD_MISSES_WALK_CYCLES) / (DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY) +- +The DTLB miss rate gives a measure how often a TLB miss occurred +per instruction. The store miss calculations are done using ALL-LOADS TLB walks. 
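Westmere has no dedicated store-walk events, so the TLB_DATA group above derives the store numbers by subtracting the load walks from all walks. A short C sketch of that subtraction, with invented counts (variable names are illustrative only):

#include <stdio.h>

int main(void) {
    double load_misses = 2.0e6;  /* DTLB_LOAD_MISSES_ANY         (PMC0) */
    double all_misses  = 3.0e6;  /* DTLB_MISSES_ANY              (PMC1) */
    double load_cycles = 4.0e7;  /* DTLB_LOAD_MISSES_WALK_CYCLES (PMC2) */
    double all_cycles  = 5.5e7;  /* DTLB_MISSES_WALK_CYCLES      (PMC3) */

    double store_misses = all_misses - load_misses;  /* stores = all - loads */
    double store_cycles = all_cycles - load_cycles;

    printf("L1 DTLB store misses              %.0f\n", store_misses);
    printf("L1 DTLB store miss duration [Cyc] %.2f\n", store_cycles / store_misses);
    return 0;
}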
+ diff --git a/collectors/likwid/groups/westmere/TLB_INSTR.txt b/collectors/likwid/groups/westmere/TLB_INSTR.txt new file mode 100644 index 0000000..2f0f90c --- /dev/null +++ b/collectors/likwid/groups/westmere/TLB_INSTR.txt @@ -0,0 +1,27 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ITLB_MISSES_ANY +PMC1 ITLB_MISSES_WALK_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + +LONG +Formulas: +L1 ITLB misses = ITLB_MISSES_ANY +L1 ITLB miss rate = ITLB_MISSES_ANY / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_CYCLES / ITLB_MISSES_ANY +- +The ITLB miss rate gives a measure how often a TLB miss occurred +per instruction. The duration measures how many cycles a page walk took. + diff --git a/collectors/likwid/groups/westmere/UOPS.txt b/collectors/likwid/groups/westmere/UOPS.txt new file mode 100644 index 0000000..b2446aa --- /dev/null +++ b/collectors/likwid/groups/westmere/UOPS.txt @@ -0,0 +1,35 @@ +SHORT UOPs execution info + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_EXECUTED_THREAD +PMC2 UOPS_RETIRED_ANY +PMC3 UOPS_ISSUED_FUSED + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Issued UOPs PMC0 +Merged UOPs PMC3 +Executed UOPs PMC1 +Retired UOPs PMC2 + +LONG
+Formulas: +Issued UOPs = UOPS_ISSUED_ANY +Merged UOPs = UOPS_ISSUED_FUSED +Executed UOPs = UOPS_EXECUTED_THREAD +Retired UOPs = UOPS_RETIRED_ANY +- +This group returns information about the instruction pipeline. It measures the +issued, executed and retired uOPs and allows deriving the number of uOPs which were +issued but not executed as well as the number of uOPs which were executed but never retired. +The executed but not retired uOPs commonly come from speculatively executed branches.
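The interesting numbers in this group are the differences between pipeline stages. A speculative C sketch with hypothetical counts; note that uop fusion means the raw stage counts are not strictly comparable, which is presumably why the group also reports the merged (fused) uops:

#include <stdio.h>

int main(void) {
    double issued   = 1.2e9;  /* UOPS_ISSUED_ANY      (PMC0) */
    double executed = 1.1e9;  /* UOPS_EXECUTED_THREAD (PMC1) */
    double retired  = 1.0e9;  /* UOPS_RETIRED_ANY     (PMC2) */

    /* executed but never retired: usually mis-speculated work */
    printf("Executed, not retired %.3e\n", executed - retired);
    /* issued but not executed (fusion makes this an approximation) */
    printf("Issued, not executed  %.3e\n", issued - executed);
    return 0;
}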
+ diff --git a/collectors/likwid/groups/westmere/VIEW.txt b/collectors/likwid/groups/westmere/VIEW.txt new file mode 100644 index 0000000..38d907c --- /dev/null +++ b/collectors/likwid/groups/westmere/VIEW.txt @@ -0,0 +1,50 @@ +SHORT Overview of arithmetic and memory performance + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR +PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION +PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION +UPMC0 UNC_QMC_NORMAL_READS_ANY +UPMC1 UNC_QMC_WRITES_FULL_ANY +UPMC2 UNC_QHL_REQUESTS_REMOTE_READS +UPMC3 UNC_QHL_REQUESTS_LOCAL_READS +UPMC4 UNC_QHL_REQUESTS_REMOTE_WRITES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] (DP assumed) 1.0E-06*(PMC0*2.0+PMC1)/time +SP [MFLOP/s] (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +SP [MUOPS/s] 1.0E-06*PMC2/time +DP [MUOPS/s] 1.0E-06*PMC3/time +Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64/time +Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64 +Remote Read BW [MBytes/s] 1.0E-06*(UPMC2)*64/time +Remote Write BW [MBytes/s] 1.0E-06*(UPMC4)*64/time +Remote BW [MBytes/s] 1.0E-06*(UPMC2+UPMC4)*64/time + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime +SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime +Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/time +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/time +SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/time +DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/time +Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64/time +Memory data volume [GBytes] = 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64 +Remote Read BW [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS)*64/time +Remote Write BW [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time +Remote BW [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time +- +This is an overview group using the capabilities of Westmere to measure multiple events at +the same time. + diff --git a/collectors/likwid/groups/westmereEX/BRANCH.txt b/collectors/likwid/groups/westmereEX/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/westmereEX/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total.
The branch misprediction ratio directly states +what fraction of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/westmereEX/CACHE.txt b/collectors/likwid/groups/westmereEX/CACHE.txt new file mode 100644 index 0000000..eb160f6 --- /dev/null +++ b/collectors/likwid/groups/westmereEX/CACHE.txt @@ -0,0 +1,25 @@ +SHORT Data cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +data cache misses PMC0 +data cache miss rate PMC0/FIXC0 + +LONG +Formulas: +data cache misses = L1D_REPL +data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY +- +This group measures the locality of your data accesses with regard to the L1 +cache. The data cache miss rate gives a measure how often it was necessary to +get cache lines from higher levels of the memory hierarchy. + diff --git a/collectors/likwid/groups/westmereEX/DATA.txt b/collectors/likwid/groups/westmereEX/DATA.txt new file mode 100644 index 0000000..31bba51 --- /dev/null +++ b/collectors/likwid/groups/westmereEX/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_INST_RETIRED_LOADS +PMC1 MEM_INST_RETIRED_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_INST_RETIRED_LOADS/MEM_INST_RETIRED_STORES +- +This is a simple metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/westmereEX/DIVIDE.txt b/collectors/likwid/groups/westmereEX/DIVIDE.txt new file mode 100644 index 0000000..2677a19 --- /dev/null +++ b/collectors/likwid/groups/westmereEX/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ARITH_NUM_DIV +PMC1 ARITH_CYCLES_DIV_BUSY + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_NUM_DIV +Avg.
divide unit usage duration = ARITH_CYCLES_DIV_BUSY/ARITH_NUM_DIV +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/westmereEX/FLOPS_DP.txt b/collectors/likwid/groups/westmereEX/FLOPS_DP.txt new file mode 100644 index 0000000..0c2e56c --- /dev/null +++ b/collectors/likwid/groups/westmereEX/FLOPS_DP.txt @@ -0,0 +1,35 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR +PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION +PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +SP [MUOPS/s] 1.0E-06*PMC2/time +DP [MUOPS/s] 1.0E-06*PMC3/time + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime +Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime +SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime +DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime +- +Westmere EX offers no way to measure MFLOP rates exactly if mixed precision calculations are done. +Therefore both single and double precision are measured to ensure the correctness +of the measurements. You can check whether your code was vectorized by comparing the number of +FP_COMP_OPS_EXE_SSE_FP_PACKED events with the FP_COMP_OPS_EXE_SSE_FP_SCALAR events. + diff --git a/collectors/likwid/groups/westmereEX/FLOPS_SP.txt b/collectors/likwid/groups/westmereEX/FLOPS_SP.txt new file mode 100644 index 0000000..d7c8e8e --- /dev/null +++ b/collectors/likwid/groups/westmereEX/FLOPS_SP.txt @@ -0,0 +1,35 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR +PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION +PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +SP [MUOPS/s] 1.0E-06*PMC2/time +DP [MUOPS/s] 1.0E-06*PMC3/time + +LONG
+Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime +Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime +SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime +DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime +- +Westmere EX offers no way to measure MFLOP rates exactly if mixed precision calculations are done. +Therefore both single and double precision are measured to ensure the correctness +of the measurements. You can check whether your code was vectorized by comparing the number of +FP_COMP_OPS_EXE_SSE_FP_PACKED events with the FP_COMP_OPS_EXE_SSE_FP_SCALAR events.
+ diff --git a/collectors/likwid/groups/westmereEX/FLOPS_X87.txt b/collectors/likwid/groups/westmereEX/FLOPS_X87.txt new file mode 100644 index 0000000..39cd8b4 --- /dev/null +++ b/collectors/likwid/groups/westmereEX/FLOPS_X87.txt @@ -0,0 +1,21 @@ +SHORT X87 MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INST_RETIRED_X87 + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +X87 [MFLOP/s] 1.0E-06*PMC0/time + +LONG +Formulas: +X87 [MFLOP/s] = 1.0E-06*INST_RETIRED_X87/runtime +- +Profiling group to measure X87 FLOP rate. + diff --git a/collectors/likwid/groups/westmereEX/ICACHE.txt b/collectors/likwid/groups/westmereEX/ICACHE.txt new file mode 100644 index 0000000..49943ff --- /dev/null +++ b/collectors/likwid/groups/westmereEX/ICACHE.txt @@ -0,0 +1,25 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1I_READS +PMC1 L1I_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 + +LONG +Formulas: +L1I request rate = L1I_READS / INSTR_RETIRED_ANY +L1I miss rate = L1I_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = L1I_MISSES / L1I_READS +- +This group measures some L1 instruction cache metrics. diff --git a/collectors/likwid/groups/westmereEX/L2.txt b/collectors/likwid/groups/westmereEX/L2.txt new file mode 100644 index 0000000..e950021 --- /dev/null +++ b/collectors/likwid/groups/westmereEX/L2.txt @@ -0,0 +1,38 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPL +PMC1 L1D_M_EVICT +PMC2 L1I_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG
+Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the +number of cache lines allocated in the L1 and the number of modified cache lines +evicted from the L1. Also reports the total data volume transferred between L2 +and L1 cache. Note that this bandwidth also includes data transfers due to a +write allocate load on a store miss in L1 and traffic caused by misses in the +instruction cache.
+ diff --git a/collectors/likwid/groups/westmereEX/L2CACHE.txt b/collectors/likwid/groups/westmereEX/L2CACHE.txt new file mode 100644 index 0000000..343b263 --- /dev/null +++ b/collectors/likwid/groups/westmereEX/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_RQSTS_REFERENCES +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_RQSTS_REFERENCES/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_RQSTS_REFERENCES +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure how often it was necessary to get +cache lines from memory. And finally L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/westmereEX/L3.txt b/collectors/likwid/groups/westmereEX/L3.txt new file mode 100644 index 0000000..7e5cb04 --- /dev/null +++ b/collectors/likwid/groups/westmereEX/L3.txt @@ -0,0 +1,36 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ANY +PMC1 L2_LINES_OUT_DEMAND_DIRTY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ANY*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ANY*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DEMAND_DIRTY*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is +computed by the number of cache lines allocated in the L2 and the number of +modified cache lines evicted from the L2. Also reports the data volume transferred +between the L3 and L2 caches. Note that this bandwidth also includes data transfers +due to a write allocate load on a store miss in L2.
+ diff --git a/collectors/likwid/groups/westmereEX/L3CACHE.txt b/collectors/likwid/groups/westmereEX/L3CACHE.txt new file mode 100644 index 0000000..262f948 --- /dev/null +++ b/collectors/likwid/groups/westmereEX/L3CACHE.txt @@ -0,0 +1,52 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +CBOX0C0 LLC_HITS_ALL +CBOX0C1 LLC_MISSES_ALL +CBOX1C0 LLC_HITS_ALL +CBOX1C1 LLC_MISSES_ALL +CBOX2C0 LLC_HITS_ALL +CBOX2C1 LLC_MISSES_ALL +CBOX3C0 LLC_HITS_ALL +CBOX3C1 LLC_MISSES_ALL +CBOX4C0 LLC_HITS_ALL +CBOX4C1 LLC_MISSES_ALL +CBOX5C0 LLC_HITS_ALL +CBOX5C1 LLC_MISSES_ALL +CBOX6C0 LLC_HITS_ALL +CBOX6C1 LLC_MISSES_ALL +CBOX7C0 LLC_HITS_ALL +CBOX7C1 LLC_MISSES_ALL +CBOX8C0 LLC_HITS_ALL +CBOX8C1 LLC_MISSES_ALL +CBOX9C0 LLC_HITS_ALL +CBOX9C1 LLC_MISSES_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate (CBOX0C0+CBOX0C1+CBOX1C0+CBOX1C1+CBOX2C0+CBOX2C1+CBOX3C0+CBOX3C1+CBOX4C0+CBOX4C1+CBOX5C0+CBOX5C1+CBOX6C0+CBOX6C1+CBOX7C0+CBOX7C1+CBOX8C0+CBOX8C1+CBOX9C0+CBOX9C1)/FIXC0 +L3 miss rate (CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1)/FIXC0 +L3 miss ratio (CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1)/(CBOX0C0+CBOX0C1+CBOX1C0+CBOX1C1+CBOX2C0+CBOX2C1+CBOX3C0+CBOX3C1+CBOX4C0+CBOX4C1+CBOX5C0+CBOX5C1+CBOX6C0+CBOX6C1+CBOX7C0+CBOX7C1+CBOX8C0+CBOX8C1+CBOX9C0+CBOX9C1) + +LONG +Formulas: +L3 request rate = (SUM(LLC_HITS_ALL)+SUM(LLC_MISSES_ALL))/INSTR_RETIRED_ANY +L3 miss rate = SUM(LLC_MISSES_ALL)/INSTR_RETIRED_ANY +L3 miss ratio = SUM(LLC_MISSES_ALL)/(SUM(LLC_HITS_ALL)+SUM(LLC_MISSES_ALL)) +- +This group measures the locality of your data accesses with regard to the +L3 cache. L3 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L3 miss rate gives a measure how often it was necessary to get +cache lines from memory. And finally L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get data cache miss ratio as low as possible by increasing your cache reuse. 
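On Westmere EX the LLC events have to be summed over the ten C-Box uncore units, which is what the long CBOX expressions above spell out term by term. A compact C sketch doing the same with arrays of hypothetical per-CBOX counts:

#include <stdio.h>

int main(void) {
    /* hypothetical LLC_HITS_ALL / LLC_MISSES_ALL readings, one per C-Box */
    double hits[10]   = {1e7, 9e6, 1.1e7, 8e6, 1e7, 9e6, 1e7, 1.2e7, 9e6, 1e7};
    double misses[10] = {1e6, 8e5, 1.2e6, 9e5, 1e6, 7e5, 1e6, 1.1e6, 9e5, 1e6};
    double instr = 2.0e9;  /* INSTR_RETIRED_ANY (FIXC0) */

    double h = 0.0, m = 0.0;
    for (int i = 0; i < 10; i++) { h += hits[i]; m += misses[i]; }

    printf("L3 request rate %.4f\n", (h + m) / instr);
    printf("L3 miss rate    %.4f\n", m / instr);
    printf("L3 miss ratio   %.4f\n", m / (h + m));
    return 0;
}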
+ + diff --git a/collectors/likwid/groups/westmereEX/MEM.txt b/collectors/likwid/groups/westmereEX/MEM.txt new file mode 100644 index 0000000..5d4fc62 --- /dev/null +++ b/collectors/likwid/groups/westmereEX/MEM.txt @@ -0,0 +1,38 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +MBOX0C0 FVC_EV0_BBOX_CMDS_READS +MBOX0C1 DRAM_CMD_CAS_WR_OPN +MBOX0C2 DRAM_MISC_CAS_WR_CLS +MBOX1C0 FVC_EV0_BBOX_CMDS_READS +MBOX1C1 DRAM_CMD_CAS_WR_OPN +MBOX1C2 DRAM_MISC_CAS_WR_CLS + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX0C2+MBOX1C1+MBOX1C2)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX0C2+MBOX1C1+MBOX1C2)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1+MBOX0C2+MBOX1C2)*64/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1+MBOX0C2+MBOX1C2)*64 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1)+SUM(MBOXxC2))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1)+SUM(MBOXxC2))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1)+SUM(MBOXxC2))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1)+SUM(MBOXxC2))*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +In addition to the bandwidth it also outputs the data volume. + diff --git a/collectors/likwid/groups/westmereEX/NUMA.txt b/collectors/likwid/groups/westmereEX/NUMA.txt new file mode 100644 index 0000000..41fbe62 --- /dev/null +++ b/collectors/likwid/groups/westmereEX/NUMA.txt @@ -0,0 +1,33 @@ +SHORT Local and remote memory accesses + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 OFFCORE_RESPONSE_0_LOCAL_DRAM +PMC1 OFFCORE_RESPONSE_1_REMOTE_DRAM + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local DRAM data volume [GByte] 1.E-09*PMC0*64 +Local DRAM bandwidth [MByte/s] 1.E-06*(PMC0*64)/time +Remote DRAM data volume [GByte] 1.E-09*PMC1*64 +Remote DRAM bandwidth [MByte/s] 1.E-06*(PMC1*64)/time +Memory data volume [GByte] 1.E-09*(PMC0+PMC1)*64 +Memory bandwidth [MByte/s] 1.E-06*((PMC0+PMC1)*64)/time + +LONG +Formulas: +CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY +Local DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_0_LOCAL_DRAM*64 +Local DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_0_LOCAL_DRAM*64)/time +Remote DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_1_REMOTE_DRAM*64 +Remote DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_1_REMOTE_DRAM*64)/time +Memory data volume [GByte] = 1.E-09*(OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64 +Memory bandwidth [MByte/s] = 1.E-06*((OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64)/time +-- +This performance group measures the data traffic of CPU cores to local and remote +memory.
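The local/remote split is what makes the NUMA group useful for spotting placement problems: remote traffic should stay small relative to local traffic. A minimal C sketch of the derived metrics, with invented counts:

#include <stdio.h>

int main(void) {
    double local  = 5.0e8;  /* OFFCORE_RESPONSE_0_LOCAL_DRAM  (PMC0) */
    double remote = 5.0e7;  /* OFFCORE_RESPONSE_1_REMOTE_DRAM (PMC1) */
    double time   = 2.0;    /* runtime in seconds */

    printf("Local DRAM bandwidth  [MByte/s] %.2f\n", 1.0e-6 * local  * 64.0 / time);
    printf("Remote DRAM bandwidth [MByte/s] %.2f\n", 1.0e-6 * remote * 64.0 / time);
    /* a high remote fraction hints at bad page placement or thread pinning */
    printf("Remote fraction                 %.3f\n", remote / (local + remote));
    return 0;
}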
diff --git a/collectors/likwid/groups/westmereEX/TLB_DATA.txt b/collectors/likwid/groups/westmereEX/TLB_DATA.txt new file mode 100644 index 0000000..d256b8c --- /dev/null +++ b/collectors/likwid/groups/westmereEX/TLB_DATA.txt @@ -0,0 +1,35 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_LOAD_MISSES_ANY +PMC1 DTLB_MISSES_ANY +PMC2 DTLB_LOAD_MISSES_WALK_CYCLES +PMC3 DTLB_MISSES_WALK_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB load misses PMC0 +L1 DTLB load miss rate PMC0/FIXC0 +L1 DTLB load miss duration [Cyc] PMC2/PMC0 +L1 DTLB store misses (PMC1-PMC0) +L1 DTLB store miss rate (PMC1-PMC0)/FIXC0 +L1 DTLB store miss duration [Cyc] (PMC3-PMC2)/(PMC1-PMC0) + +LONG +Formulas: +L1 DTLB load misses = DTLB_LOAD_MISSES_ANY +L1 DTLB load miss rate = DTLB_LOAD_MISSES_ANY / INSTR_RETIRED_ANY +L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_CYCLES / DTLB_LOAD_MISSES_ANY +L1 DTLB store misses = DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY +L1 DTLB store miss rate = (DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY) / INSTR_RETIRED_ANY +L1 DTLB store miss duration [Cyc] = (DTLB_MISSES_WALK_CYCLES-DTLB_LOAD_MISSES_WALK_CYCLES) / (DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY) +- +The DTLB miss rate gives a measure how often a TLB miss occurred +per instruction. The store miss calculations are done using ALL-LOADS TLB walks. + diff --git a/collectors/likwid/groups/westmereEX/TLB_INSTR.txt b/collectors/likwid/groups/westmereEX/TLB_INSTR.txt new file mode 100644 index 0000000..2f0f90c --- /dev/null +++ b/collectors/likwid/groups/westmereEX/TLB_INSTR.txt @@ -0,0 +1,27 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ITLB_MISSES_ANY +PMC1 ITLB_MISSES_WALK_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + +LONG
+Formulas: +L1 ITLB misses = ITLB_MISSES_ANY +L1 ITLB miss rate = ITLB_MISSES_ANY / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_CYCLES / ITLB_MISSES_ANY +- +The ITLB miss rate gives a measure how often a TLB miss occurred +per instruction. The duration measures how many cycles a page walk took. + diff --git a/collectors/likwid/groups/westmereEX/UOPS.txt b/collectors/likwid/groups/westmereEX/UOPS.txt new file mode 100644 index 0000000..f58fda6 --- /dev/null +++ b/collectors/likwid/groups/westmereEX/UOPS.txt @@ -0,0 +1,32 @@ +SHORT UOPs execution info + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC2 UOPS_RETIRED_ANY +PMC3 UOPS_ISSUED_FUSED + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Issued UOPs PMC0 +Merged UOPs PMC3 +Retired UOPs PMC2 + +LONG
+Formulas: +Issued UOPs = UOPS_ISSUED_ANY +Merged UOPs = UOPS_ISSUED_FUSED +Retired UOPs = UOPS_RETIRED_ANY +- +This group returns information about the instruction pipeline. It measures the +issued, merged and retired uOPs and allows deriving the number of uOPs which were +issued but never retired.
+The issued but not retired uOPs commonly come from speculatively executed branches. + diff --git a/collectors/likwid/groups/zen/BRANCH.txt b/collectors/likwid/groups/zen/BRANCH.txt new file mode 100644 index 0000000..dbaf07f --- /dev/null +++ b/collectors/likwid/groups/zen/BRANCH.txt @@ -0,0 +1,32 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_BRANCH_INSTR +PMC3 RETIRED_MISP_BRANCH_INSTR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +Branch rate PMC2/PMC0 +Branch misprediction rate PMC3/PMC0 +Branch misprediction ratio PMC3/PMC2 +Instructions per branch PMC0/PMC2 + +LONG +Formulas: +Branch rate = RETIRED_BRANCH_INSTR/RETIRED_INSTRUCTIONS +Branch misprediction rate = RETIRED_MISP_BRANCH_INSTR/RETIRED_INSTRUCTIONS +Branch misprediction ratio = RETIRED_MISP_BRANCH_INSTR/RETIRED_BRANCH_INSTR +Instructions per branch = RETIRED_INSTRUCTIONS/RETIRED_BRANCH_INSTR +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio expresses +directly what fraction of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/zen/CACHE.txt b/collectors/likwid/groups/zen/CACHE.txt new file mode 100644 index 0000000..b773e5a --- /dev/null +++ b/collectors/likwid/groups/zen/CACHE.txt @@ -0,0 +1,39 @@ +SHORT Data cache miss rate/ratio + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 DATA_CACHE_ACCESSES +PMC3 DATA_CACHE_REFILLS_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +data cache requests PMC2 +data cache request rate PMC2/PMC0 +data cache misses PMC3 +data cache miss rate PMC3/PMC0 +data cache miss ratio PMC3/PMC2 + +LONG +Formulas: +data cache requests = DATA_CACHE_ACCESSES +data cache request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS +data cache misses = DATA_CACHE_REFILLS_ALL +data cache miss rate = DATA_CACHE_REFILLS_ALL / RETIRED_INSTRUCTIONS +data cache miss ratio = DATA_CACHE_REFILLS_ALL / DATA_CACHE_ACCESSES +- +This group measures the locality of your data accesses with regard to the +L1 cache. Data cache request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The data cache miss rate gives a measure of how often it was necessary to get +cache lines from higher levels of the memory hierarchy. And finally +data cache miss ratio tells you how many of your memory references required +a cache line to be loaded from a higher level. While the data cache miss rate +might be given by your algorithm, you should try to get data cache miss ratio +as low as possible by increasing your cache reuse.
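The three derived metrics of the CACHE group above are not independent: the miss ratio equals the miss rate divided by the request rate. A small hedged C sketch with hypothetical counter values makes the relation explicit:

#include <assert.h>
#include <math.h>
#include <stdio.h>

int main(void)
{
    /* Hypothetical raw counts for the zen CACHE group. */
    double instr   = 1.0e9;  /* RETIRED_INSTRUCTIONS   (PMC0) */
    double access  = 4.0e8;  /* DATA_CACHE_ACCESSES    (PMC2) */
    double refills = 2.0e7;  /* DATA_CACHE_REFILLS_ALL (PMC3) */

    double request_rate = access / instr;
    double miss_rate    = refills / instr;
    double miss_ratio   = refills / access;

    /* miss ratio == miss rate / request rate (up to rounding). */
    assert(fabs(miss_ratio - miss_rate / request_rate) < 1e-12);

    printf("request rate %.3f, miss rate %.4f, miss ratio %.4f\n",
           request_rate, miss_rate, miss_ratio);
    return 0;
}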
+ diff --git a/collectors/likwid/groups/zen/CPI.txt b/collectors/likwid/groups/zen/CPI.txt new file mode 100644 index 0000000..23e4f8c --- /dev/null +++ b/collectors/likwid/groups/zen/CPI.txt @@ -0,0 +1,30 @@ +SHORT Cycles per instruction + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_UOPS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +CPI (based on uops) PMC1/PMC2 +IPC PMC0/PMC1 + + +LONG +Formulas: +CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS +CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS +IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED +- +This group measures how efficiently the processor works with +regard to instruction throughput. Also important as a standalone +metric is RETIRED_INSTRUCTIONS as it tells you how many instructions +you need to execute for a task. An optimization might show very +low CPI values but execute many more instructions for it. + diff --git a/collectors/likwid/groups/zen/DATA.txt b/collectors/likwid/groups/zen/DATA.txt new file mode 100644 index 0000000..e061b90 --- /dev/null +++ b/collectors/likwid/groups/zen/DATA.txt @@ -0,0 +1,23 @@ +SHORT Load to store ratio + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 LS_DISPATCH_LOADS +PMC3 LS_DISPATCH_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +Load to store ratio PMC2/PMC3 + +LONG +Formulas: +Load to store ratio = LS_DISPATCH_LOADS/LS_DISPATCH_STORES +- +This is a simple metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/zen/DIVIDE.txt b/collectors/likwid/groups/zen/DIVIDE.txt new file mode 100644 index 0000000..c98500b --- /dev/null +++ b/collectors/likwid/groups/zen/DIVIDE.txt @@ -0,0 +1,26 @@ +SHORT Divide unit information + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 DIV_OP_COUNT +PMC3 DIV_BUSY_CYCLES + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +Number of divide ops PMC2 +Avg. divide unit usage duration PMC3/PMC2 + +LONG +Formulas: +CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS +Number of divide ops = DIV_OP_COUNT +Avg. divide unit usage duration = DIV_BUSY_CYCLES/DIV_OP_COUNT +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/zen/ENERGY.txt b/collectors/likwid/groups/zen/ENERGY.txt new file mode 100644 index 0000000..f58c5b1 --- /dev/null +++ b/collectors/likwid/groups/zen/ENERGY.txt @@ -0,0 +1,32 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PWR0 RAPL_CORE_ENERGY +PWR1 RAPL_PKG_ENERGY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +Energy Core [J] PWR0 +Power Core [W] PWR0/time +Energy PKG [J] PWR1 +Power PKG [W] PWR1/time + +LONG +Formulas: +Power Core [W] = RAPL_CORE_ENERGY/time +Power PKG [W] = RAPL_PKG_ENERGY/time +- +Ryzen implements the RAPL interface previously introduced by Intel. +This interface makes it possible to monitor the consumed energy on the core and package +domain.
+It is not documented by AMD which parts of the CPU are in which domain. + diff --git a/collectors/likwid/groups/zen/FLOPS_DP.txt b/collectors/likwid/groups/zen/FLOPS_DP.txt new file mode 100644 index 0000000..420f942 --- /dev/null +++ b/collectors/likwid/groups/zen/FLOPS_DP.txt @@ -0,0 +1,26 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_SSE_AVX_FLOPS_DOUBLE_ALL +PMC3 MERGE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +DP [MFLOP/s] 1.0E-06*(PMC2)/time + +LONG +Formulas: +CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS +DP [MFLOP/s] = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_DOUBLE_ALL)/time +- +Profiling group to measure double precision FLOP rate. The event might +have a higher per-cycle increment than 15, so the MERGE event is required. + + diff --git a/collectors/likwid/groups/zen/FLOPS_SP.txt b/collectors/likwid/groups/zen/FLOPS_SP.txt new file mode 100644 index 0000000..1f64af1 --- /dev/null +++ b/collectors/likwid/groups/zen/FLOPS_SP.txt @@ -0,0 +1,26 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_SSE_AVX_FLOPS_SINGLE_ALL +PMC3 MERGE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +SP [MFLOP/s] 1.0E-06*(PMC2)/time + +LONG +Formulas: +CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS +SP [MFLOP/s] = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_SINGLE_ALL)/time +- +Profiling group to measure single precision FLOP rate. The event might +have a higher per-cycle increment than 15, so the MERGE event is required. + + diff --git a/collectors/likwid/groups/zen/ICACHE.txt b/collectors/likwid/groups/zen/ICACHE.txt new file mode 100644 index 0000000..f98c28a --- /dev/null +++ b/collectors/likwid/groups/zen/ICACHE.txt @@ -0,0 +1,28 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 ICACHE_FETCHES +PMC2 ICACHE_L2_REFILLS +PMC3 ICACHE_SYSTEM_REFILLS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/PMC0 +L1I request rate PMC1/PMC0 +L1I miss rate (PMC2+PMC3)/PMC0 +L1I miss ratio (PMC2+PMC3)/PMC1 + +LONG
Formulas: +L1I request rate = ICACHE_FETCHES / RETIRED_INSTRUCTIONS +L1I miss rate = (ICACHE_L2_REFILLS + ICACHE_SYSTEM_REFILLS)/RETIRED_INSTRUCTIONS +L1I miss ratio = (ICACHE_L2_REFILLS + ICACHE_SYSTEM_REFILLS)/ICACHE_FETCHES +- +This group measures the locality of your instruction code with regard to the +L1 I-Cache.
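All of these group files are consumed by likwid-perfctr, which programs the EVENTSET and evaluates the METRICS lines. As a usage sketch (the core list, group name and program are placeholders), a whole-run measurement and a measurement restricted to marked regions might look like:

likwid-perfctr -C 0-3 -g ICACHE ./a.out
likwid-perfctr -C 0-3 -g ICACHE -m ./a.out

The -m variant requires the application to mark regions with the Marker API declared in likwid-marker.h further down in this diff, roughly like this (compile with -DLIKWID_PERFMON and link against liblikwid):

#include <likwid-marker.h>

int main(void)
{
    LIKWID_MARKER_INIT;                /* once, in a serial region */
    LIKWID_MARKER_START("compute");
    /* ... code to be measured ... */
    LIKWID_MARKER_STOP("compute");
    LIKWID_MARKER_CLOSE;               /* writes results for likwid-perfctr */
    return 0;
}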
+ diff --git a/collectors/likwid/groups/zen/L2.txt b/collectors/likwid/groups/zen/L2.txt new file mode 100644 index 0000000..420e34d --- /dev/null +++ b/collectors/likwid/groups/zen/L2.txt @@ -0,0 +1,28 @@ +SHORT L2 cache bandwidth in MBytes/s (experimental) + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC3 REQUESTS_TO_L2_GRP1_ALL_NO_PF + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC3*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC3)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC3)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*REQUESTS_TO_L2_GRP1_ALL_NO_PF*64.0/time +L2D load data volume [GBytes] = 1.0E-09*REQUESTS_TO_L2_GRP1_ALL_NO_PF*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(REQUESTS_TO_L2_GRP1_ALL_NO_PF)*64.0/time +L2 data volume [GBytes] = 1.0E-09*(REQUESTS_TO_L2_GRP1_ALL_NO_PF)*64.0 +- +Profiling group to measure L2 cache bandwidth. There is no way to measure +the store traffic between L1 and L2. diff --git a/collectors/likwid/groups/zen/L3.txt b/collectors/likwid/groups/zen/L3.txt new file mode 100644 index 0000000..6fe808a --- /dev/null +++ b/collectors/likwid/groups/zen/L3.txt @@ -0,0 +1,32 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +CPMC0 L3_ACCESS +CPMC1 L3_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +L3 access bandwidth [MBytes/s] 1.0E-06*CPMC0*64.0/time +L3 access data volume [GBytes] 1.0E-09*CPMC0*64.0 +L3 access rate [%] (CPMC0/PMC0)*100.0 +L3 miss rate [%] (CPMC1/PMC0)*100.0 +L3 miss ratio [%] (CPMC1/CPMC0)*100.0 + +LONG +Formulas: +L3 access bandwidth [MBytes/s] = 1.0E-06*L3_ACCESS*64.0/time +L3 access data volume [GBytes] = 1.0E-09*L3_ACCESS*64.0 +L3 access rate [%] = (L3_ACCESS/RETIRED_INSTRUCTIONS)*100 +L3 miss rate [%] = (L3_MISS/RETIRED_INSTRUCTIONS)*100 +L3 miss ratio [%] = (L3_MISS/L3_ACCESS)*100 +- +Profiling group to measure L3 cache bandwidth. There is no way to measure +the store traffic between L2 and L3. The only two published L3 events are +L3_ACCESS and L3_MISS. diff --git a/collectors/likwid/groups/zen/MEM.txt b/collectors/likwid/groups/zen/MEM.txt new file mode 100644 index 0000000..36ff58f --- /dev/null +++ b/collectors/likwid/groups/zen/MEM.txt @@ -0,0 +1,32 @@ +SHORT Main memory bandwidth in MBytes/s (experimental) + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +DFC0 DATA_FROM_LOCAL_DRAM_CHANNEL +DFC1 DATA_TO_LOCAL_DRAM_CHANNEL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +Memory bandwidth [MBytes/s] 1.0E-06*(DFC0+DFC1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(DFC0+DFC1)*64.0 + +LONG +Formulas: +Memory bandwidth [MBytes/s] = 1.0E-06*(DATA_FROM_LOCAL_DRAM_CHANNEL+DATA_TO_LOCAL_DRAM_CHANNEL)*64.0/time +Memory data volume [GBytes] = 1.0E-09*(DATA_FROM_LOCAL_DRAM_CHANNEL+DATA_TO_LOCAL_DRAM_CHANNEL)*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on a +per-socket basis.
+The group provides almost accurate results for the total bandwidth and data volume. +AMD describes this metric as "approximate" in the documentation for AMD Rome. + +Be aware that although the events imply a traffic direction (FROM and TO), they +cannot be used to differentiate between read and write traffic. The events will be +renamed to avoid that confusion in the future. diff --git a/collectors/likwid/groups/zen/MEM_DP.txt b/collectors/likwid/groups/zen/MEM_DP.txt new file mode 100644 index 0000000..9c2fe38 --- /dev/null +++ b/collectors/likwid/groups/zen/MEM_DP.txt @@ -0,0 +1,39 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_SSE_AVX_FLOPS_DOUBLE_ALL +PMC3 MERGE +DFC0 DATA_FROM_LOCAL_DRAM_CHANNEL +DFC1 DATA_TO_LOCAL_DRAM_CHANNEL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +DP [MFLOP/s] 1.0E-06*(PMC2)/time +Memory bandwidth [MBytes/s] 1.0E-06*(DFC0+DFC1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(DFC0+DFC1)*64.0 +Operational intensity PMC2/((DFC0+DFC1)*64.0) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_DOUBLE_ALL)/time +Memory bandwidth [MBytes/s] = 1.0E-06*(DATA_FROM_LOCAL_DRAM_CHANNEL+DATA_TO_LOCAL_DRAM_CHANNEL)*64.0/time +Memory data volume [GBytes] = 1.0E-09*(DATA_FROM_LOCAL_DRAM_CHANNEL+DATA_TO_LOCAL_DRAM_CHANNEL)*64.0 +Operational intensity = RETIRED_SSE_AVX_FLOPS_DOUBLE_ALL/((DATA_FROM_LOCAL_DRAM_CHANNEL+DATA_TO_LOCAL_DRAM_CHANNEL)*64.0) +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on a +per-socket basis. +The group provides almost accurate results for the total bandwidth and data volume. +AMD describes this metric as "approximate" in the documentation for AMD Rome. + +Be aware that although the events imply a traffic direction (FROM and TO), they +cannot be used to differentiate between read and write traffic. The events will be +renamed to avoid that confusion in the future. + diff --git a/collectors/likwid/groups/zen/MEM_SP.txt b/collectors/likwid/groups/zen/MEM_SP.txt new file mode 100644 index 0000000..48ce75c --- /dev/null +++ b/collectors/likwid/groups/zen/MEM_SP.txt @@ -0,0 +1,39 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_SSE_AVX_FLOPS_SINGLE_ALL +PMC3 MERGE +DFC0 DATA_FROM_LOCAL_DRAM_CHANNEL +DFC1 DATA_TO_LOCAL_DRAM_CHANNEL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +SP [MFLOP/s] 1.0E-06*(PMC2)/time +Memory bandwidth [MBytes/s] 1.0E-06*(DFC0+DFC1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(DFC0+DFC1)*64.0 +Operational intensity PMC2/((DFC0+DFC1)*64.0) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_SINGLE_ALL)/time +Memory bandwidth [MBytes/s] = 1.0E-06*(DATA_FROM_LOCAL_DRAM_CHANNEL+DATA_TO_LOCAL_DRAM_CHANNEL)*64.0/time +Memory data volume [GBytes] = 1.0E-09*(DATA_FROM_LOCAL_DRAM_CHANNEL+DATA_TO_LOCAL_DRAM_CHANNEL)*64.0 +Operational intensity = RETIRED_SSE_AVX_FLOPS_SINGLE_ALL/((DATA_FROM_LOCAL_DRAM_CHANNEL+DATA_TO_LOCAL_DRAM_CHANNEL)*64.0) +- +Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events it is only possible to measure on a +per-socket basis. +The group provides almost accurate results for the total bandwidth and data volume. +AMD describes this metric as "approximate" in the documentation for AMD Rome. + +Be aware that although the events imply a traffic direction (FROM and TO), they +cannot be used to differentiate between read and write traffic. The events will be +renamed to avoid that confusion in the future. + diff --git a/collectors/likwid/groups/zen/NUMA.txt b/collectors/likwid/groups/zen/NUMA.txt new file mode 100644 index 0000000..19ccdc1 --- /dev/null +++ b/collectors/likwid/groups/zen/NUMA.txt @@ -0,0 +1,35 @@ +SHORT Local and remote memory accesses (experimental) + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 DATA_CACHE_REFILLS_LOCAL_ALL +PMC1 DATA_CACHE_REFILLS_REMOTE_ALL +PMC2 HWPREF_DATA_CACHE_FILLS_LOCAL_ALL +PMC3 HWPREF_DATA_CACHE_FILLS_REMOTE_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +Local bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC2)*64.0/time +Local data volume [GBytes] 1.0E-09*(PMC0+PMC2)*64.0 +Remote bandwidth [MBytes/s] 1.0E-06*(PMC1+PMC3)*64.0/time +Remote data volume [GBytes] 1.0E-09*(PMC1+PMC3)*64.0 +Total bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC2+PMC1+PMC3)*64.0/time +Total data volume [GBytes] 1.0E-09*(PMC0+PMC2+PMC1+PMC3)*64.0 + +LONG +Formulas: +Local bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_LOCAL_ALL+HWPREF_DATA_CACHE_FILLS_LOCAL_ALL)*64.0/time +Local data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_LOCAL_ALL+HWPREF_DATA_CACHE_FILLS_LOCAL_ALL)*64.0 +Remote bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_REMOTE_ALL+HWPREF_DATA_CACHE_FILLS_REMOTE_ALL)*64.0/time +Remote data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_REMOTE_ALL+HWPREF_DATA_CACHE_FILLS_REMOTE_ALL)*64.0 +Total bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_LOCAL_ALL+HWPREF_DATA_CACHE_FILLS_LOCAL_ALL+DATA_CACHE_REFILLS_REMOTE_ALL+HWPREF_DATA_CACHE_FILLS_REMOTE_ALL)*64.0/time +Total data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_LOCAL_ALL+HWPREF_DATA_CACHE_FILLS_LOCAL_ALL+DATA_CACHE_REFILLS_REMOTE_ALL+HWPREF_DATA_CACHE_FILLS_REMOTE_ALL)*64.0 +- +Profiling group to measure NUMA traffic. The data sources range from +local L2, CCX and memory for the local metrics and remote CCX and memory +for the remote metrics. There are also events that measure the software +prefetches from local and remote domain but AMD Zen provides only 4 counters.
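The Operational intensity metric of the MEM_DP/MEM_SP groups above is the x-axis of a roofline plot: retired FLOPs divided by the DRAM traffic in bytes. A hedged C sketch with hypothetical counter values:

#include <stdio.h>

int main(void)
{
    /* Hypothetical raw counts for the zen MEM_DP group. */
    double flops = 8.0e9;               /* RETIRED_SSE_AVX_FLOPS_DOUBLE_ALL */
    double dfc0  = 6.0e7, dfc1 = 2.0e7; /* DRAM channel counts (cache lines) */
    double time  = 1.0;                 /* measurement time [s] */

    double bytes = (dfc0 + dfc1) * 64.0;

    printf("DP [MFLOP/s]:          %.1f\n", 1.0E-06 * flops / time);
    printf("bandwidth [MBytes/s]:  %.1f\n", 1.0E-06 * bytes / time);
    printf("operational intensity: %.2f FLOP/byte\n", flops / bytes);
    return 0;
}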
diff --git a/collectors/likwid/groups/zen/TLB.txt b/collectors/likwid/groups/zen/TLB.txt new file mode 100644 index 0000000..510284b --- /dev/null +++ b/collectors/likwid/groups/zen/TLB.txt @@ -0,0 +1,39 @@ +SHORT TLB miss rate/ratio + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 DATA_CACHE_ACCESSES +PMC2 L1_DTLB_MISS_ANY_L2_HIT +PMC3 L1_DTLB_MISS_ANY_L2_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/PMC0 +L1 DTLB request rate PMC1/PMC0 +L1 DTLB miss rate (PMC2+PMC3)/PMC0 +L1 DTLB miss ratio (PMC2+PMC3)/PMC1 +L2 DTLB request rate (PMC2+PMC3)/PMC0 +L2 DTLB miss rate PMC3/PMC0 +L2 DTLB miss ratio PMC3/(PMC2+PMC3) + + +LONG +Formulas: +L1 DTLB request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS +L1 DTLB miss rate = (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS)/RETIRED_INSTRUCTIONS +L1 DTLB miss ratio = (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS)/DATA_CACHE_ACCESSES +L2 DTLB request rate = (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS)/RETIRED_INSTRUCTIONS +L2 DTLB miss rate = L1_DTLB_MISS_ANY_L2_MISS / RETIRED_INSTRUCTIONS +L2 DTLB miss ratio = L1_DTLB_MISS_ANY_L2_MISS / (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS) +- +L1 DTLB request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The DTLB miss rate gives a measure of how often a TLB miss occurred +per instruction. And finally L1 DTLB miss ratio tells you how many +of your memory references caused a TLB miss on average. +NOTE: The L2 metrics are only relevant if L2 DTLB request rate is +equal to the L1 DTLB miss rate! diff --git a/collectors/likwid/groups/zen2/BRANCH.txt b/collectors/likwid/groups/zen2/BRANCH.txt new file mode 100644 index 0000000..dbaf07f --- /dev/null +++ b/collectors/likwid/groups/zen2/BRANCH.txt @@ -0,0 +1,32 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_BRANCH_INSTR +PMC3 RETIRED_MISP_BRANCH_INSTR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +Branch rate PMC2/PMC0 +Branch misprediction rate PMC3/PMC0 +Branch misprediction ratio PMC3/PMC2 +Instructions per branch PMC0/PMC2 + +LONG +Formulas: +Branch rate = RETIRED_BRANCH_INSTR/RETIRED_INSTRUCTIONS +Branch misprediction rate = RETIRED_MISP_BRANCH_INSTR/RETIRED_INSTRUCTIONS +Branch misprediction ratio = RETIRED_MISP_BRANCH_INSTR/RETIRED_BRANCH_INSTR +Instructions per branch = RETIRED_INSTRUCTIONS/RETIRED_BRANCH_INSTR +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio expresses +directly what fraction of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate.
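Since the BRANCH metrics are simple ratios of three counters, the identity stated in the LONG text above (instructions per branch = 1/branch rate) can be checked directly. A short sketch with hypothetical counts:

#include <stdio.h>

int main(void)
{
    double instr  = 1.0e9;  /* RETIRED_INSTRUCTIONS */
    double branch = 2.0e8;  /* RETIRED_BRANCH_INSTR */
    double misp   = 1.0e7;  /* RETIRED_MISP_BRANCH_INSTR */

    double branch_rate = branch / instr;

    printf("branch rate:         %.2f\n", branch_rate);
    printf("misprediction rate:  %.3f\n", misp / instr);
    printf("misprediction ratio: %.3f\n", misp / branch);
    /* Instructions per branch is the reciprocal of the branch rate. */
    printf("instr per branch:    %.1f\n", 1.0 / branch_rate);
    return 0;
}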
+ diff --git a/collectors/likwid/groups/zen2/CACHE.txt b/collectors/likwid/groups/zen2/CACHE.txt new file mode 100644 index 0000000..b773e5a --- /dev/null +++ b/collectors/likwid/groups/zen2/CACHE.txt @@ -0,0 +1,39 @@ +SHORT Data cache miss rate/ratio + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 DATA_CACHE_ACCESSES +PMC3 DATA_CACHE_REFILLS_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +data cache requests PMC2 +data cache request rate PMC2/PMC0 +data cache misses PMC3 +data cache miss rate PMC3/PMC0 +data cache miss ratio PMC3/PMC2 + +LONG +Formulas: +data cache requests = DATA_CACHE_ACCESSES +data cache request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS +data cache misses = DATA_CACHE_REFILLS_ALL +data cache miss rate = DATA_CACHE_REFILLS_ALL / RETIRED_INSTRUCTIONS +data cache miss ratio = DATA_CACHE_REFILLS_ALL / DATA_CACHE_ACCESSES +- +This group measures the locality of your data accesses with regard to the +L1 cache. Data cache request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The data cache miss rate gives a measure of how often it was necessary to get +cache lines from higher levels of the memory hierarchy. And finally +data cache miss ratio tells you how many of your memory references required +a cache line to be loaded from a higher level. While the data cache miss rate +might be given by your algorithm, you should try to get data cache miss ratio +as low as possible by increasing your cache reuse. + diff --git a/collectors/likwid/groups/zen2/CPI.txt b/collectors/likwid/groups/zen2/CPI.txt new file mode 100644 index 0000000..23e4f8c --- /dev/null +++ b/collectors/likwid/groups/zen2/CPI.txt @@ -0,0 +1,30 @@ +SHORT Cycles per instruction + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_UOPS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +CPI (based on uops) PMC1/PMC2 +IPC PMC0/PMC1 + + +LONG +Formulas: +CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS +CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS +IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED +- +This group measures how efficiently the processor works with +regard to instruction throughput. Also important as a standalone +metric is RETIRED_INSTRUCTIONS as it tells you how many instructions +you need to execute for a task. An optimization might show very +low CPI values but execute many more instructions for it. + diff --git a/collectors/likwid/groups/zen2/DATA.txt b/collectors/likwid/groups/zen2/DATA.txt new file mode 100644 index 0000000..e061b90 --- /dev/null +++ b/collectors/likwid/groups/zen2/DATA.txt @@ -0,0 +1,23 @@ +SHORT Load to store ratio + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 LS_DISPATCH_LOADS +PMC3 LS_DISPATCH_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +Load to store ratio PMC2/PMC3 + +LONG +Formulas: +Load to store ratio = LS_DISPATCH_LOADS/LS_DISPATCH_STORES +- +This is a simple metric to determine your load to store ratio.
+ diff --git a/collectors/likwid/groups/zen2/DIVIDE.txt b/collectors/likwid/groups/zen2/DIVIDE.txt new file mode 100644 index 0000000..13d629b --- /dev/null +++ b/collectors/likwid/groups/zen2/DIVIDE.txt @@ -0,0 +1,25 @@ +SHORT Divide unit information + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 DIV_OP_COUNT +PMC3 DIV_BUSY_CYCLES + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +Number of divide ops PMC2 +Avg. divide unit usage duration PMC3/PMC2 + +LONG +Formulas: +Number of divide ops = DIV_OP_COUNT +Avg. divide unit usage duration = DIV_BUSY_CYCLES/DIV_OP_COUNT +-- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/zen2/ENERGY.txt b/collectors/likwid/groups/zen2/ENERGY.txt new file mode 100644 index 0000000..f58c5b1 --- /dev/null +++ b/collectors/likwid/groups/zen2/ENERGY.txt @@ -0,0 +1,32 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PWR0 RAPL_CORE_ENERGY +PWR1 RAPL_PKG_ENERGY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +Energy Core [J] PWR0 +Power Core [W] PWR0/time +Energy PKG [J] PWR1 +Power PKG [W] PWR1/time + +LONG +Formulas: +Power Core [W] = RAPL_CORE_ENERGY/time +Power PKG [W] = RAPL_PKG_ENERGY/time +- +Ryzen implements the RAPL interface previously introduced by Intel. +This interface makes it possible to monitor the consumed energy on the core and package +domain. +It is not documented by AMD which parts of the CPU are in which domain. + diff --git a/collectors/likwid/groups/zen2/FLOPS_DP.txt b/collectors/likwid/groups/zen2/FLOPS_DP.txt new file mode 100644 index 0000000..740acb9 --- /dev/null +++ b/collectors/likwid/groups/zen2/FLOPS_DP.txt @@ -0,0 +1,28 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_SSE_AVX_FLOPS_ALL +PMC3 MERGE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +DP [MFLOP/s] 1.0E-06*(PMC2)/time + +LONG +Formulas: +CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS +DP [MFLOP/s] = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_ALL)/time +- +Profiling group to measure (double-precision) FLOP rate. The event might +have a higher per-cycle increment than 15, so the MERGE event is required. In +contrast to AMD Zen, the Zen2 microarchitecture does not provide events to +differentiate between single- and double-precision.
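The ENERGY group above turns the accumulated RAPL energy counters into average power by dividing by the measurement time. A trivial hedged sketch with hypothetical readings:

#include <stdio.h>

int main(void)
{
    double e_core = 120.0;  /* RAPL_CORE_ENERGY [J], hypothetical */
    double e_pkg  = 180.0;  /* RAPL_PKG_ENERGY  [J], hypothetical */
    double time   = 2.0;    /* measurement time [s] */

    printf("Power Core [W]: %.1f\n", e_core / time);  /* PWR0/time */
    printf("Power PKG  [W]: %.1f\n", e_pkg / time);   /* PWR1/time */
    return 0;
}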
+ + diff --git a/collectors/likwid/groups/zen2/FLOPS_SP.txt b/collectors/likwid/groups/zen2/FLOPS_SP.txt new file mode 100644 index 0000000..0d25aeb --- /dev/null +++ b/collectors/likwid/groups/zen2/FLOPS_SP.txt @@ -0,0 +1,28 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_SSE_AVX_FLOPS_ALL +PMC3 MERGE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +SP [MFLOP/s] 1.0E-06*(PMC2)/time + +LONG +Formulas: +CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS +SP [MFLOP/s] = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_ALL)/time +- +Profiling group to measure (single-precision) FLOP rate. The event might +have a higher per-cycle increment than 15, so the MERGE event is required. In +contrast to AMD Zen, the Zen2 microarchitecture does not provide events to +differentiate between single- and double-precision. + + diff --git a/collectors/likwid/groups/zen2/ICACHE.txt b/collectors/likwid/groups/zen2/ICACHE.txt new file mode 100644 index 0000000..f98c28a --- /dev/null +++ b/collectors/likwid/groups/zen2/ICACHE.txt @@ -0,0 +1,28 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 ICACHE_FETCHES +PMC2 ICACHE_L2_REFILLS +PMC3 ICACHE_SYSTEM_REFILLS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/PMC0 +L1I request rate PMC1/PMC0 +L1I miss rate (PMC2+PMC3)/PMC0 +L1I miss ratio (PMC2+PMC3)/PMC1 + +LONG +Formulas: +L1I request rate = ICACHE_FETCHES / RETIRED_INSTRUCTIONS +L1I miss rate = (ICACHE_L2_REFILLS + ICACHE_SYSTEM_REFILLS)/RETIRED_INSTRUCTIONS +L1I miss ratio = (ICACHE_L2_REFILLS + ICACHE_SYSTEM_REFILLS)/ICACHE_FETCHES +- +This group measures the locality of your instruction code with regard to the +L1 I-Cache. + diff --git a/collectors/likwid/groups/zen2/L2.txt b/collectors/likwid/groups/zen2/L2.txt new file mode 100644 index 0000000..420e34d --- /dev/null +++ b/collectors/likwid/groups/zen2/L2.txt @@ -0,0 +1,28 @@ +SHORT L2 cache bandwidth in MBytes/s (experimental) + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC3 REQUESTS_TO_L2_GRP1_ALL_NO_PF + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC3*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC3)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC3)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*REQUESTS_TO_L2_GRP1_ALL_NO_PF*64.0/time +L2D load data volume [GBytes] = 1.0E-09*REQUESTS_TO_L2_GRP1_ALL_NO_PF*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(REQUESTS_TO_L2_GRP1_ALL_NO_PF)*64.0/time +L2 data volume [GBytes] = 1.0E-09*(REQUESTS_TO_L2_GRP1_ALL_NO_PF)*64.0 +- +Profiling group to measure L2 cache bandwidth. There is no way to measure +the store traffic between L1 and L2.
diff --git a/collectors/likwid/groups/zen2/L3.txt b/collectors/likwid/groups/zen2/L3.txt new file mode 100644 index 0000000..6fe808a --- /dev/null +++ b/collectors/likwid/groups/zen2/L3.txt @@ -0,0 +1,32 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +CPMC0 L3_ACCESS +CPMC1 L3_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +L3 access bandwidth [MBytes/s] 1.0E-06*CPMC0*64.0/time +L3 access data volume [GBytes] 1.0E-09*CPMC0*64.0 +L3 access rate [%] (CPMC0/PMC0)*100.0 +L3 miss rate [%] (CPMC1/PMC0)*100.0 +L3 miss ratio [%] (CPMC1/CPMC0)*100.0 + +LONG +Formulas: +L3 access bandwidth [MBytes/s] = 1.0E-06*L3_ACCESS*64.0/time +L3 access data volume [GBytes] = 1.0E-09*L3_ACCESS*64.0 +L3 access rate [%] = (L3_ACCESS/RETIRED_INSTRUCTIONS)*100 +L3 miss rate [%] = (L3_MISS/RETIRED_INSTRUCTIONS)*100 +L3 miss ratio [%] = (L3_MISS/L3_ACCESS)*100 +- +Profiling group to measure L3 cache bandwidth. There is no way to measure +the store traffic between L2 and L3. The only two published L3 events are +L3_ACCESS and L3_MISS. diff --git a/collectors/likwid/groups/zen2/MEM.txt b/collectors/likwid/groups/zen2/MEM.txt new file mode 100644 index 0000000..c589640 --- /dev/null +++ b/collectors/likwid/groups/zen2/MEM.txt @@ -0,0 +1,35 @@ +SHORT Main memory bandwidth in MBytes/s (experimental) + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +DFC0 DATA_FROM_LOCAL_DRAM_CHANNEL +DFC1 DATA_TO_LOCAL_DRAM_CHANNEL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +Memory bandwidth [MBytes/s] 1.0E-06*(DFC0+DFC1)*(4.0/num_numadomains)*64.0/time +Memory data volume [GBytes] 1.0E-09*(DFC0+DFC1)*(4.0/num_numadomains)*64.0 + +LONG +Formulas: +Memory bandwidth [MBytes/s] = 1.0E-06*(DATA_FROM_LOCAL_DRAM_CHANNEL+DATA_TO_LOCAL_DRAM_CHANNEL)*(4.0/num_numadomains)*64.0/time +Memory data volume [GBytes] = 1.0E-09*(DATA_FROM_LOCAL_DRAM_CHANNEL+DATA_TO_LOCAL_DRAM_CHANNEL)*(4.0/num_numadomains)*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on a +per-socket basis. +The group provides almost accurate results for the total bandwidth +and data volume. +The metric formulas contain a correction factor of (4.0/num_numadomains) to +return the value for all 4 memory controllers in NPS1 mode. This is probably +a workaround; information was requested from AMD but no answer has been received. + +Be aware that although the events imply a traffic direction (FROM and TO), they +cannot be used to differentiate between read and write traffic. The events will be +renamed to avoid that confusion in the future.
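The (4.0/num_numadomains) factor in the zen2 MEM metrics above appears to extrapolate the counts of the measured NUMA domain(s) to the socket's four memory controllers, so that NPS settings other than NPS4 still report full-socket bandwidth; the text itself calls it a probable workaround. A hedged C sketch of the scaling (all values invented):

#include <stdio.h>

int main(void)
{
    double dfc0 = 5.0e7, dfc1 = 2.0e7; /* DATA_FROM/TO_LOCAL_DRAM_CHANNEL */
    double time = 1.0;                 /* measurement time [s] */
    int num_numadomains = 1;           /* NPS1: one NUMA domain per socket */

    /* Scale the measured domain(s) up to the four controllers per socket. */
    double corr = 4.0 / (double)num_numadomains;
    double bw = 1.0E-06 * (dfc0 + dfc1) * corr * 64.0 / time;

    printf("Memory bandwidth [MBytes/s]: %.1f\n", bw);
    return 0;
}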
diff --git a/collectors/likwid/groups/zen2/NUMA.txt b/collectors/likwid/groups/zen2/NUMA.txt new file mode 100644 index 0000000..6cb881a --- /dev/null +++ b/collectors/likwid/groups/zen2/NUMA.txt @@ -0,0 +1,35 @@ +SHORT Local and remote memory accesses (experimental) + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 DATA_CACHE_REFILLS_LOCAL_ALL +PMC1 DATA_CACHE_REFILLS_REMOTE_ALL +PMC2 HWPREF_DATA_CACHE_FILLS_LOCAL_ALL +PMC3 HWPREF_DATA_CACHE_FILLS_REMOTE_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +Local bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC2)*64.0/time +Local data volume [GBytes] 1.0E-09*(PMC0+PMC2)*64.0 +Remote bandwidth [MBytes/s] 1.0E-06*(PMC1+PMC3)*64.0/time +Remote data volume [GBytes] 1.0E-09*(PMC1+PMC3)*64.0 +Total bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC2+PMC1+PMC3)*64.0/time +Total data volume [GBytes] 1.0E-09*(PMC0+PMC2+PMC1+PMC3)*64.0 + +LONG +Formulas: +Local bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_LOCAL_ALL+HWPREF_DATA_CACHE_FILLS_LOCAL_ALL)*64.0/time +Local data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_LOCAL_ALL+HWPREF_DATA_CACHE_FILLS_LOCAL_ALL)*64.0 +Remote bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_REMOTE_ALL+HWPREF_DATA_CACHE_FILLS_REMOTE_ALL)*64.0/time +Remote data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_REMOTE_ALL+HWPREF_DATA_CACHE_FILLS_REMOTE_ALL)*64.0 +Total bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_LOCAL_ALL+HWPREF_DATA_CACHE_FILLS_LOCAL_ALL+DATA_CACHE_REFILLS_REMOTE_ALL+HWPREF_DATA_CACHE_FILLS_REMOTE_ALL)*64.0/time +Total data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_LOCAL_ALL+HWPREF_DATA_CACHE_FILLS_LOCAL_ALL+DATA_CACHE_REFILLS_REMOTE_ALL+HWPREF_DATA_CACHE_FILLS_REMOTE_ALL)*64.0 +- +Profiling group to measure NUMA traffic. The data sources range from +local L2, CCX and memory for the local metrics and remote CCX and memory +for the remote metrics. There are also events that measure the software +prefetches from local and remote domain but AMD Zen provides only 4 counters. diff --git a/collectors/likwid/groups/zen2/TLB.txt b/collectors/likwid/groups/zen2/TLB.txt new file mode 100644 index 0000000..510284b --- /dev/null +++ b/collectors/likwid/groups/zen2/TLB.txt @@ -0,0 +1,39 @@ +SHORT TLB miss rate/ratio + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 DATA_CACHE_ACCESSES +PMC2 L1_DTLB_MISS_ANY_L2_HIT +PMC3 L1_DTLB_MISS_ANY_L2_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/PMC0 +L1 DTLB request rate PMC1/PMC0 +L1 DTLB miss rate (PMC2+PMC3)/PMC0 +L1 DTLB miss ratio (PMC2+PMC3)/PMC1 +L2 DTLB request rate (PMC2+PMC3)/PMC0 +L2 DTLB miss rate PMC3/PMC0 +L2 DTLB miss ratio PMC3/(PMC2+PMC3) + + +LONG +Formulas: +L1 DTLB request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS +L1 DTLB miss rate = (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS)/RETIRED_INSTRUCTIONS +L1 DTLB miss ratio = (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS)/DATA_CACHE_ACCESSES +L2 DTLB request rate = (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS)/RETIRED_INSTRUCTIONS +L2 DTLB miss rate = L1_DTLB_MISS_ANY_L2_MISS / RETIRED_INSTRUCTIONS +L2 DTLB miss ratio = L1_DTLB_MISS_ANY_L2_MISS / (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS) +- +L1 DTLB request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. 
+The DTLB miss rate gives a measure of how often a TLB miss occurred +per instruction. And finally L1 DTLB miss ratio tells you how many +of your memory references caused a TLB miss on average. +NOTE: The L2 metrics are only relevant if L2 DTLB request rate is +equal to the L1 DTLB miss rate! diff --git a/collectors/likwid/groups/zen3/.empty b/collectors/likwid/groups/zen3/.empty new file mode 100644 index 0000000..5e965d1 --- /dev/null +++ b/collectors/likwid/groups/zen3/.empty @@ -0,0 +1 @@ +There is currently no public documentation for AMD Zen3. This folder is just a placeholder for future performance groups. diff --git a/collectors/likwid/liblikwid-hwloc.a b/collectors/likwid/liblikwid-hwloc.a new file mode 100644 index 0000000..09feadd Binary files /dev/null and b/collectors/likwid/liblikwid-hwloc.a differ diff --git a/collectors/likwid/liblikwid.a b/collectors/likwid/liblikwid.a new file mode 100644 index 0000000..a3e223f Binary files /dev/null and b/collectors/likwid/liblikwid.a differ diff --git a/collectors/likwid/likwid-marker.h b/collectors/likwid/likwid-marker.h new file mode 100644 index 0000000..ebf8b89 --- /dev/null +++ b/collectors/likwid/likwid-marker.h @@ -0,0 +1,170 @@ +/* + * ======================================================================================= + * + * Filename: likwid-marker.h + * + * Description: Header File of likwid Marker API + * + * Version: + * Released: + * + * Authors: Thomas Gruber (tg), thomas.roehl@googlemail.com + * + * Project: likwid + * + * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see <http://www.gnu.org/licenses/>. + * + * ======================================================================================= + */ +#ifndef LIKWID_MARKER_H +#define LIKWID_MARKER_H + + +/** \addtogroup MarkerAPI Marker API module +* @{ +*/ +/*! +\def LIKWID_MARKER_INIT +Shortcut for likwid_markerInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed +*/ +/*! +\def LIKWID_MARKER_THREADINIT +Shortcut for likwid_markerThreadInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed +*/ +/*! +\def LIKWID_MARKER_REGISTER(regionTag) +Shortcut for likwid_markerRegisterRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed +*/ +/*! +\def LIKWID_MARKER_START(regionTag) +Shortcut for likwid_markerStartRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed +*/ +/*! +\def LIKWID_MARKER_STOP(regionTag) +Shortcut for likwid_markerStopRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed +*/ +/*! +\def LIKWID_MARKER_GET(regionTag, nevents, events, time, count) +Shortcut for likwid_markerGetResults() for \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed +*/ +/*! +\def LIKWID_MARKER_SWITCH +Shortcut for likwid_markerNextGroup() if compiled with -DLIKWID_PERFMON.
Otherwise no operation is performed +*/ +/*! +\def LIKWID_MARKER_RESET(regionTag) +Shortcut for likwid_markerResetRegion() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed +*/ +/*! +\def LIKWID_MARKER_CLOSE +Shortcut for likwid_markerClose() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed +*/ +/** @}*/ + +#ifdef LIKWID_PERFMON +#include <likwid.h> +#define LIKWID_MARKER_INIT likwid_markerInit() +#define LIKWID_MARKER_THREADINIT likwid_markerThreadInit() +#define LIKWID_MARKER_SWITCH likwid_markerNextGroup() +#define LIKWID_MARKER_REGISTER(regionTag) likwid_markerRegisterRegion(regionTag) +#define LIKWID_MARKER_START(regionTag) likwid_markerStartRegion(regionTag) +#define LIKWID_MARKER_STOP(regionTag) likwid_markerStopRegion(regionTag) +#define LIKWID_MARKER_CLOSE likwid_markerClose() +#define LIKWID_MARKER_RESET(regionTag) likwid_markerResetRegion(regionTag) +#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count) likwid_markerGetRegion(regionTag, nevents, events, time, count) +#else /* LIKWID_PERFMON */ +#define LIKWID_MARKER_INIT +#define LIKWID_MARKER_THREADINIT +#define LIKWID_MARKER_SWITCH +#define LIKWID_MARKER_REGISTER(regionTag) +#define LIKWID_MARKER_START(regionTag) +#define LIKWID_MARKER_STOP(regionTag) +#define LIKWID_MARKER_CLOSE +#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count) +#define LIKWID_MARKER_RESET(regionTag) +#endif /* LIKWID_PERFMON */ + + +/** \addtogroup NvMarkerAPI NvMarker API module (MarkerAPI for Nvidia GPUs) +* @{ +*/ +/*! +\def LIKWID_NVMARKER_INIT +Shortcut for likwid_gpuMarkerInit() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed +*/ +/*! +\def LIKWID_NVMARKER_THREADINIT +Shortcut for likwid_gpuMarkerThreadInit() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed +*/ +/*! +\def LIKWID_NVMARKER_REGISTER(regionTag) +Shortcut for likwid_gpuMarkerRegisterRegion() with \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed +*/ +/*! +\def LIKWID_NVMARKER_START(regionTag) +Shortcut for likwid_gpuMarkerStartRegion() with \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed +*/ +/*! +\def LIKWID_NVMARKER_STOP(regionTag) +Shortcut for likwid_gpuMarkerStopRegion() with \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed +*/ +/*! +\def LIKWID_NVMARKER_GET(regionTag, ngpus, nevents, events, time, count) +Shortcut for likwid_gpuMarkerGetRegion() for \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed +*/ +/*! +\def LIKWID_NVMARKER_SWITCH +Shortcut for likwid_gpuMarkerNextGroup() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed +*/ +/*! +\def LIKWID_NVMARKER_RESET(regionTag) +Shortcut for likwid_gpuMarkerResetRegion() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed +*/ +/*! +\def LIKWID_NVMARKER_CLOSE +Shortcut for likwid_gpuMarkerClose() if compiled with -DLIKWID_NVMON.
Otherwise no operation is performed +*/ +/** @}*/ + +#ifdef LIKWID_NVMON +#ifndef LIKWID_WITH_NVMON +#define LIKWID_WITH_NVMON +#endif +#include <likwid.h> +#define LIKWID_NVMARKER_INIT likwid_gpuMarkerInit() +#define LIKWID_NVMARKER_THREADINIT likwid_gpuMarkerThreadInit() +#define LIKWID_NVMARKER_SWITCH likwid_gpuMarkerNextGroup() +#define LIKWID_NVMARKER_REGISTER(regionTag) likwid_gpuMarkerRegisterRegion(regionTag) +#define LIKWID_NVMARKER_START(regionTag) likwid_gpuMarkerStartRegion(regionTag) +#define LIKWID_NVMARKER_STOP(regionTag) likwid_gpuMarkerStopRegion(regionTag) +#define LIKWID_NVMARKER_CLOSE likwid_gpuMarkerClose() +#define LIKWID_NVMARKER_RESET(regionTag) likwid_gpuMarkerResetRegion(regionTag) +#define LIKWID_NVMARKER_GET(regionTag, ngpus, nevents, events, time, count) \ + likwid_gpuMarkerGetRegion(regionTag, ngpus, nevents, events, time, count) +#else /* LIKWID_NVMON */ +#define LIKWID_NVMARKER_INIT +#define LIKWID_NVMARKER_THREADINIT +#define LIKWID_NVMARKER_SWITCH +#define LIKWID_NVMARKER_REGISTER(regionTag) +#define LIKWID_NVMARKER_START(regionTag) +#define LIKWID_NVMARKER_STOP(regionTag) +#define LIKWID_NVMARKER_CLOSE +#define LIKWID_NVMARKER_GET(regionTag, ngpus, nevents, events, time, count) +#define LIKWID_NVMARKER_RESET(regionTag) +#endif /* LIKWID_NVMON */ + + + +#endif /* LIKWID_MARKER_H */ diff --git a/collectors/likwid/likwid.h b/collectors/likwid/likwid.h new file mode 100644 index 0000000..e48e6d7 --- /dev/null +++ b/collectors/likwid/likwid.h @@ -0,0 +1,2305 @@ +/* + * ======================================================================================= + * + * Filename: likwid.h + * + * Description: Header File of likwid API + * + * Version: + * Released: + * + * Authors: Thomas Gruber (tr), thomas.roehl@googlemail.com + * + * Project: likwid + * + * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see <http://www.gnu.org/licenses/>. + * + * ======================================================================================= + */ +#ifndef LIKWID_H +#define LIKWID_H + +#include +#include +#include + +#include + +#define DEBUGLEV_ONLY_ERROR 0 +#define DEBUGLEV_INFO 1 +#define DEBUGLEV_DETAIL 2 +#define DEBUGLEV_DEVELOP 3 + +#define LIKWID_VERSION "VERSION.RELEASE.MINORVERSION" +#define LIKWID_COMMIT GITCOMMIT + +extern int perfmon_verbosity; +extern int likwid_nvmon_verbosity; + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef LIKWID_MARKER_INIT +#include <likwid-marker.h> +#endif + +/* +################################################################################ +# Marker API related functions +################################################################################ +*/ +/** \addtogroup MarkerAPI Marker API module +* @{ +*/ +/*! \brief Initialize LIKWID's marker API + +Must be called in a serial region of the application to set up basic data structures +of LIKWID.
+Reads environment variables: +- LIKWID_MODE (access mode) +- LIKWID_MASK (event bitmask) +- LIKWID_EVENTS (event string) +- LIKWID_THREADS (cpu list separated by ,) +- LIKWID_GROUPS (amount of groups) +*/ +extern void likwid_markerInit(void) __attribute__ ((visibility ("default") )); +/*! \brief Initialize LIKWID's marker API for the current thread + +Must be called in a parallel region of the application to set up basic data structures +of LIKWID. Before you can call likwid_markerThreadInit() you have to call likwid_markerInit(). + +*/ +extern void likwid_markerThreadInit(void) __attribute__ ((visibility ("default") )); +/*! \brief Switch to next group to measure + +Should be called in a serial region of code. If it is to be called from inside +a parallel region, ensure only one thread runs it by using "#pragma omp single" +or similar. Additionally, if this function is called in a parallel region, +ensure that the serial region is preceded by a barrier ("#pragma omp barrier" +or similar) to prevent race conditions. +*/ +extern void likwid_markerNextGroup(void) __attribute__ ((visibility ("default") )); +/*! \brief Close LIKWID's marker API + +Must be called in a serial region of the application. It gathers all data of regions and +writes them out to a file (filepath in env variable LIKWID_FILEPATH). +*/ +extern void likwid_markerClose(void) __attribute__ ((visibility ("default") )); +/*! \brief Register a measurement region + +Initializes the hashTable entry in order to reduce execution time of likwid_markerStartRegion() +@param regionTag [in] Initialize data using this string +@return Error code +*/ +extern int likwid_markerRegisterRegion(const char* regionTag) __attribute__ ((visibility ("default") )); +/*! \brief Start a measurement region + +Reads the values of all configured counters and saves the results under the +name given in regionTag. Must be called on every thread that is to be measured, +e.g. if the code to be measured is run in a parallel region, this function must +also be called in a parallel region (typically the same parallel region as the +measured code). If this function is to be called multiple times in one parallel +region, place a barrier ("#pragma omp barrier" or similar) before each call to +likwid_markerStartRegion +@param regionTag [in] Store data using this string +@return Error code of start operation +*/ +extern int likwid_markerStartRegion(const char* regionTag) __attribute__ ((visibility ("default") )); +/*! \brief Stop a measurement region + +Reads the values of all configured counters and saves the results under the +name given in regionTag. The measurement data of the stopped region gets summed +up in global region counters. Must be called on every thread that is to be +measured, e.g. if the code to be measured is run in a parallel region, this +function must also be called in a parallel region (typically the same parallel +region as the measured code). If this function is called multiple times in one +parallel region, place a barrier ("#pragma omp barrier" or similar) after each +call to likwid_markerStopRegion +@param regionTag [in] Store data using this string +@return Error code of stop operation +*/ +extern int likwid_markerStopRegion(const char* regionTag) __attribute__ ((visibility ("default") )); +/*! \brief Reset a measurement region + +Reset the values of all configured counters and timers.
+@param regionTag [in] Reset data using this string +@return Error code of reset operation +*/ +extern int likwid_markerResetRegion(const char* regionTag) __attribute__ ((visibility ("default") )); +/*! \brief Get accumulated data of a code region + +Get the accumulated data of the current thread for the given regionTag. +@param regionTag [in] Print data using this string +@param nr_events [in,out] Length of events array +@param events [out] Events array for the intermediate results +@param time [out] Accumulated measurement time +@param count [out] Call count of the code region +*/ +extern void likwid_markerGetRegion(const char* regionTag, int* nr_events, double* events, double *time, int *count) __attribute__ ((visibility ("default") )); +/* utility routines */ +/*! \brief Get CPU ID of the current process/thread + +Returns the ID of the CPU the current process or thread is running on. +@return current CPU ID +*/ +extern int likwid_getProcessorId() __attribute__ ((visibility ("default") )); +/*! \brief Pin the current process to given CPU + +Pin the current process to the given CPU ID. The process cannot be scheduled to +another CPU after pinning but the pinning can be changed anytime with this function. +@param [in] processorId CPU ID to pin the current process to +@return error code (1 for success, 0 for error) +*/ +extern int likwid_pinProcess(int processorId) __attribute__ ((visibility ("default") )); +/*! \brief Pin the current thread to given CPU + +Pin the current thread to the given CPU ID. The thread cannot be scheduled to +another CPU after pinning but the pinning can be changed anytime with this function +@param [in] processorId CPU ID to pin the current thread to +@return error code (1 for success, 0 for error) +*/ +extern int likwid_pinThread(int processorId) __attribute__ ((visibility ("default") )); +/** @}*/ + +/* +################################################################################ +# Access client related functions +################################################################################ +*/ +/** \addtogroup Access Access module + * @{ + */ + +/*! \brief Enum for the access modes + +LIKWID supports multiple access modes to the MSR and PCI performance monitoring +registers. For direct access the user must have enough privileges to access the +MSR and PCI devices. The daemon mode forwards the operations to a daemon with +higher privileges. +*/ +typedef enum { + ACCESSMODE_PERF = -1, /*!< \brief Access performance monitoring through perf_event kernel interface */ + ACCESSMODE_DIRECT = 0, /*!< \brief Access performance monitoring registers directly */ + ACCESSMODE_DAEMON = 1 /*!< \brief Use the access daemon to access the registers */ +} AccessMode; + +/*! \brief Set access mode + +Sets the mode in which the MSR and PCI registers should be accessed. 0 for direct access (probably root privileges required) and 1 for accesses through the access daemon. It must be called before HPMinit() +@param [in] mode (0=direct, 1=daemon) +*/ +extern void HPMmode(int mode) __attribute__ ((visibility ("default") )); +/*! \brief Initialize access module + +Initialize the module internals to either the MSR/PCI files or the access daemon +@return error code (0 for success) +*/ +extern int HPMinit() __attribute__ ((visibility ("default") )); +/*! \brief Add CPU to access module + +Add the given CPU to the access module. This opens the communication to either the MSR/PCI files or the access daemon.
+@param [in] cpu_id CPU that should be enabled for measurements +@return error code (0 for success, -ENODEV if access cannot be initialized) +*/ +extern int HPMaddThread(int cpu_id) __attribute__ ((visibility ("default") )); +/*! \brief Close connections + +Close the connections to the MSR/PCI files or the access daemon +*/ +extern void HPMfinalize() __attribute__ ((visibility ("default") )); +/** @}*/ + +/* +################################################################################ +# Config file related functions +################################################################################ +*/ +/** \addtogroup Config Config file module +* @{ +*/ +/*! \brief Structure holding values of the configuration file + +LIKWID supports the definition of runtime values in a configuration file. The +most important configurations in most cases are the path to the access daemon and +the corresponding access mode. In order to avoid reading in the system topology +at each start, a path to a topology file can be set. The other values are mostly +used internally. +*/ +typedef struct { + char* configFileName; /*!< \brief Path to the configuration file */ + char* topologyCfgFileName; /*!< \brief Path to the topology file */ + char* daemonPath; /*!< \brief Path of the access daemon */ + char* groupPath; /*!< \brief Path of default performance group directory */ + AccessMode daemonMode; /*!< \brief Access mode to the MSR and PCI registers */ + int maxNumThreads; /*!< \brief Maximum number of HW threads */ + int maxNumNodes; /*!< \brief Maximum number of NUMA nodes */ +} Likwid_Configuration; + +/** \brief Pointer for exporting the Configuration data structure */ +typedef Likwid_Configuration* Configuration_t; +/*! \brief Read the config file of LIKWID, if it exists + +Search for the LIKWID config file and read in the values. +Currently the paths /usr/local/etc/likwid.cfg, /etc/likwid.cfg and the path +defined in config.mk are checked. +@return error code (0 for success, -EFAULT if no file can be found) +*/ +extern int init_configuration(void) __attribute__ ((visibility ("default") )); +/*! \brief Destroy the config structure + +Destroys the current config structure and frees all allocated memory for path names +@return error code (0 for success, -EFAULT if config structure not initialized) +*/ +extern int destroy_configuration(void) __attribute__ ((visibility ("default") )); + + +/*! \brief Retrieve the config structure + +Get the initialized configuration +\sa Configuration_t +@return Configuration_t (pointer to internal Configuration structure) +*/ +extern Configuration_t get_configuration(void) __attribute__ ((visibility ("default") )); + +/*! \brief Set group path in the config structure + +Set group path in the config structure. The path must be a directory. +@param [in] path +@return error code (0 for success, -ENOMEM if reallocation failed, -ENOTDIR if not a directory) +*/ +extern int config_setGroupPath(const char* path) __attribute__ ((visibility ("default") )); + +/** @}*/ +/* +################################################################################ +# CPU topology related functions +################################################################################ +*/ +/** \addtogroup CPUTopology CPU information module +* @{ +*/ +/*! \brief Structure with general CPU information + +General information covers CPU family, model, name and current clock and vendor-specific +information like the version of Intel's performance monitoring facility.
+*/
+typedef struct {
+    uint32_t family; /*!< \brief CPU family ID*/
+    uint32_t model; /*!< \brief CPU model ID */
+    uint32_t stepping; /*!< \brief Stepping (version) of the CPU */
+    uint32_t vendor; /*!< \brief Vendor of the CPU */
+    uint32_t part; /*!< \brief Part number of the CPU */
+    uint64_t clock; /*!< \brief Current clock frequency of the executing CPU*/
+    int turbo; /*!< \brief Flag if CPU has a turbo mode */
+    char* osname; /*!< \brief Name of the CPU reported by OS */
+    char* name; /*!< \brief Name of the CPU as identified by LIKWID */
+    char* short_name; /*!< \brief Short name of the CPU*/
+    char* features; /*!< \brief String with all features supported by the CPU*/
+    int isIntel; /*!< \brief Flag if it is an Intel CPU*/
+    char architecture[20]; /*!< \brief Name of the architecture like x86_64 or ppc64 (comparable with uname -m)*/
+    int supportUncore; /*!< \brief Flag if system has Uncore performance monitors */
+    int supportClientmem; /*!< \brief Flag if system has mappable memory controllers */
+    uint64_t featureFlags; /*!< \brief Mask of all features supported by the CPU*/
+    uint32_t perf_version; /*!< \brief Version of Intel's performance monitoring facility */
+    uint32_t perf_num_ctr; /*!< \brief Number of general purpose HWthread-local performance monitoring counters */
+    uint32_t perf_width_ctr; /*!< \brief Bit width of fixed and general purpose counters */
+    uint32_t perf_num_fixed_ctr; /*!< \brief Number of fixed purpose HWthread-local performance monitoring counters */
+} CpuInfo;
+
+/*! \brief Structure with IDs of a HW thread
+
+For each HW thread this structure stores the ID of the thread inside a CPU, the
+CPU core ID of the HW thread and the CPU socket ID.
+\extends CpuTopology
+*/
+typedef struct {
+    uint32_t threadId; /*!< \brief ID of HW thread inside the CPU core */
+    uint32_t coreId; /*!< \brief ID of CPU core that executes the HW thread */
+    uint32_t packageId; /*!< \brief ID of CPU socket containing the HW thread */
+    uint32_t apicId; /*!< \brief ID of HW thread retrieved through the Advanced Programmable Interrupt Controller */
+    uint32_t inCpuSet; /*!< \brief Flag if HW thread is inside the CPUset */
+} HWThread;
+
+/*! \brief Enum of possible caches
+
+CPU caches can have different tasks and hold different kinds of data. This enum lists all cache types used in the supported CPUs.
+\extends CacheLevel
+*/
+typedef enum {
+    NOCACHE=0, /*!< \brief No cache, used as undef value */
+    DATACACHE, /*!< \brief Cache holding data cache lines */
+    INSTRUCTIONCACHE, /*!< \brief Cache holding instruction cache lines */
+    UNIFIEDCACHE, /*!< \brief Cache holding both instruction and data cache lines */
+    ITLB, /*!< \brief Translation Lookaside Buffer cache for instruction pages */
+    DTLB /*!< \brief Translation Lookaside Buffer cache for data pages */
+} CacheType;
+
+/*! \brief Structure describing a cache level
+
+CPUs are connected to a cache hierarchy with differing numbers of caches at each level. The CacheLevel structure holds general information about the cache.
+\extends CpuTopology
+*/
+typedef struct {
+    uint32_t level; /*!< \brief Level of the cache in the hierarchy */
+    CacheType type; /*!< \brief Type of the cache */
+    uint32_t associativity; /*!< \brief Number of cache lines held by each set */
+    uint32_t sets; /*!< \brief Number of sets */
+    uint32_t lineSize; /*!< \brief Size in bytes of one cache line */
+    uint32_t size; /*!< \brief Size in bytes of the cache */
+    uint32_t threads; /*!< \brief Number of HW threads connected to the cache */
+    uint32_t inclusive; /*!< \brief Flag if cache is inclusive (also holds cache lines available in caches nearer to the CPU) or exclusive */
+} CacheLevel;
+
+/*! \brief Structure describing the topology of the HW threads in the system
+
+This structure describes the topology at HW thread level: the number of HW threads, how they are distributed over the CPU sockets/packages and how the caching hierarchy is assembled.
+*/
+typedef struct {
+    uint32_t numHWThreads; /*!< \brief Number of HW threads in the system and length of \a threadPool */
+    uint32_t activeHWThreads; /*!< \brief Number of active HW threads in the system (e.g. in cpuset) */
+    uint32_t numSockets; /*!< \brief Number of CPU sockets/packages in the system */
+    uint32_t numCoresPerSocket; /*!< \brief Number of physical cores in one CPU socket/package */
+    uint32_t numThreadsPerCore; /*!< \brief Number of HW threads in one physical CPU core */
+    uint32_t numCacheLevels; /*!< \brief Number of caches for each HW thread and length of \a cacheLevels */
+    HWThread* threadPool; /*!< \brief List of all HW thread descriptions */
+    CacheLevel* cacheLevels; /*!< \brief List of all caches in the hierarchy */
+    struct treeNode* topologyTree; /*!< \brief Anchor for a tree structure describing the system topology */
+} CpuTopology;
+
+/*! \brief Variable holding the global cpu information structure */
+extern CpuInfo cpuid_info;
+/*! \brief Variable holding the global cpu topology structure */
+extern CpuTopology cpuid_topology;
+
+/** \brief Pointer for exporting the CpuInfo data structure */
+typedef CpuInfo* CpuInfo_t;
+/** \brief Pointer for exporting the CpuTopology data structure */
+typedef CpuTopology* CpuTopology_t;
+/*! \brief Initialize topology information
+
+CpuInfo_t and CpuTopology_t are initialized by either HWLOC, CPUID/ProcFS or a topology file if present. The topology file name can be configured in the configuration file. Furthermore, the paths /etc/likwid_topo.cfg and <PREFIX>/etc/likwid_topo.cfg are checked.
+\sa CpuInfo_t and CpuTopology_t
+@return always 0
+*/
+extern int topology_init(void) __attribute__ ((visibility ("default") ));
+/*! \brief Retrieve CPU topology of the current machine
+
+\sa CpuTopology_t
+@return CpuTopology_t (pointer to internal cpuid_topology structure)
+*/
+extern CpuTopology_t get_cpuTopology(void) __attribute__ ((visibility ("default") ));
+/*! \brief Retrieve CPU information of the current machine
+
+Get the previously initialized CPU info structure containing the number of CPUs/Threads
+\sa CpuInfo_t
+@return CpuInfo_t (pointer to internal cpuid_info structure)
+*/
+extern CpuInfo_t get_cpuInfo(void) __attribute__ ((visibility ("default") ));
+/*! \brief Destroy topology structures CpuInfo_t and CpuTopology_t.
+
+Retrieved pointers to the structures are no longer valid after this function call
+\sa CpuInfo_t and CpuTopology_t
+*/
+extern void topology_finalize(void) __attribute__ ((visibility ("default") ));
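+/*
+Usage sketch for the topology module (an editorial illustration, not part of
+the API): initialize the topology once, read the exported structures, then
+clean up. Error handling is omitted.
+\code
+topology_init();
+CpuTopology_t topo = get_cpuTopology();
+CpuInfo_t info = get_cpuInfo();
+printf("%s: %u sockets, %u HW threads\n",
+       info->name, topo->numSockets, topo->numHWThreads);
+for (uint32_t i = 0; i < topo->numHWThreads; i++)
+{
+    printf("HW thread %u -> core %u, socket %u\n",
+           topo->threadPool[i].apicId, topo->threadPool[i].coreId,
+           topo->threadPool[i].packageId);
+}
+topology_finalize();
+\endcode
+*/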
+/*! \brief Print all supported architectures
+*/
+extern void print_supportedCPUs(void) __attribute__ ((visibility ("default") ));
+/** @}*/
+/*
+################################################################################
+# NUMA related functions
+################################################################################
+*/
+/** \addtogroup NumaTopology NUMA memory topology module
+ * @{
+ */
+/*! \brief CPUs in NUMA node and general information about a NUMA domain
+
+The NumaNode structure describes the topology and holds general information of a
+NUMA node. The structure is filled by calling numa_init(), either through the HWLOC
+library or by evaluating the /proc filesystem.
+\extends NumaTopology
+*/
+typedef struct {
+    uint32_t id; /*!< \brief ID of the NUMA node */
+    uint64_t totalMemory; /*!< \brief Amount of memory in the NUMA node */
+    uint64_t freeMemory; /*!< \brief Amount of free memory in the NUMA node */
+    uint32_t numberOfProcessors; /*!< \brief Number of processors covered by the NUMA node and length of \a processors */
+    uint32_t* processors; /*!< \brief List of HW threads in the NUMA node */
+    uint32_t numberOfDistances; /*!< \brief Number of distances to the other NUMA nodes in the system and self */
+    uint32_t* distances; /*!< \brief List of distances to the other NUMA nodes and self */
+} NumaNode;
+
+
+/*! \brief The NumaTopology structure describes all NUMA nodes in the current system.
+*/
+typedef struct {
+    uint32_t numberOfNodes; /*!< \brief Number of NUMA nodes in the system and length of \a nodes */
+    NumaNode* nodes; /*!< \brief List of NUMA nodes */
+} NumaTopology;
+
+/*! \brief Variable holding the global NUMA information structure */
+extern NumaTopology numa_info;
+
+/** \brief Pointer for exporting the NumaTopology data structure */
+typedef NumaTopology* NumaTopology_t;
+
+/*! \brief Initialize NUMA information
+
+Initialize the NUMA information NumaTopology_t using either HWLOC or CPUID/ProcFS. If
+a topology config file is present, it is read at topology_init() and fills \a NumaTopology_t
+\sa NumaTopology_t
+@return error code (0 for success, -1 if initialization failed)
+*/
+extern int numa_init(void) __attribute__ ((visibility ("default") ));
+/*! \brief Retrieve NUMA information of the current machine
+
+Get the previously initialized NUMA info structure
+\sa NumaTopology_t
+@return NumaTopology_t (pointer to internal numa_info structure)
+*/
+extern NumaTopology_t get_numaTopology(void) __attribute__ ((visibility ("default") ));
+/*! \brief Set memory allocation policy to interleaved
+
+Set the memory allocation policy to interleaved for the given list of CPUs
+@param [in] processorList List of processors
+@param [in] numberOfProcessors Length of processor list
+*/
+extern void numa_setInterleaved(const int* processorList, int numberOfProcessors) __attribute__ ((visibility ("default") ));
+/*! \brief Allocate memory from a specific NUMA node
+@param [in,out] ptr Start pointer of memory
+@param [in] size Size for the allocation
+@param [in] domainId ID of NUMA node for the allocation
+*/
+extern void numa_membind(void* ptr, size_t size, int domainId) __attribute__ ((visibility ("default") ));
+/*! \brief Set memory allocation policy to membind
+
+Set the memory allocation policy to membind for the given list of CPUs. This forces
+allocation to be placed in NUMA domains spanning the given processor list.
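+
+A usage sketch (an editorial illustration; the CPU IDs are hypothetical and
+numa_init() is assumed to have succeeded):
+\code
+numa_init();
+NumaTopology_t numa = get_numaTopology();
+printf("%u NUMA nodes\n", numa->numberOfNodes);
+int cpus[2] = {0, 1};        // hypothetical CPU IDs
+numa_setMembind(cpus, 2);    // bind subsequent allocations to the nodes of these CPUs
+numa_finalize();
+\endcode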
+@param [in] processorList List of processors
+@param [in] numberOfProcessors Length of processor list
+*/
+extern void numa_setMembind(const int* processorList, int numberOfProcessors) __attribute__ ((visibility ("default") ));
+/*! \brief Destroy NUMA information structure
+
+Destroys the NUMA information structure NumaTopology_t. Retrieved pointers
+to the structures are no longer valid after this function call
+\sa NumaTopology_t
+*/
+extern void numa_finalize(void) __attribute__ ((visibility ("default") ));
+/*! \brief Retrieve the number of NUMA nodes
+
+Returns the number of NUMA nodes of the current machine. Can also be read from
+NumaTopology_t
+\sa NumaTopology_t
+@return Number of NUMA nodes
+*/
+extern int likwid_getNumberOfNodes(void) __attribute__ ((visibility ("default") ));
+/** @}*/
+/*
+################################################################################
+# Affinity domains related functions
+################################################################################
+*/
+/** \addtogroup AffinityDomains Thread affinity module
+ * @{
+ */
+
+/*! \brief The AffinityDomain data structure describes a single domain in the current system
+
+The AffinityDomain data structure describes a single domain in the current system. Example domains are NUMA nodes, CPU sockets/packages or LLC (Last Level Cache) cache domains.
+\extends AffinityDomains
+*/
+typedef struct {
+    bstring tag; /*!< \brief Bstring with the ID for the affinity domain. Currently possible values: N (node), SX (socket/package X), CX (LLC cache domain X) and MX (memory domain X) */
+    uint32_t numberOfProcessors; /*!< \brief Number of HW threads in the domain and length of \a processorList */
+    uint32_t numberOfCores; /*!< \brief Number of CPU cores in the domain */
+    int* processorList; /*!< \brief List of HW thread IDs in the domain */
+} AffinityDomain;
+
+/*! \brief The AffinityDomains data structure holds different count variables describing the
+various system layers
+
+Affinity domains are, for example, the NUMA domains, CPU sockets/packages or LLC
+(Last Level Cache) cache domains of the current machine. Moreover, a list of
+\a domains holds the processor lists for each domain that are used for
+scheduling processes to domain-specific HW threads. Some counts are duplicates
+or derivations of values in \a CpuInfo, \a CpuTopology and \a NumaTopology.
+*/
+typedef struct {
+    uint32_t numberOfSocketDomains; /*!< \brief Number of CPU sockets/packages in the system */
+    uint32_t numberOfNumaDomains; /*!< \brief Number of NUMA nodes in the system */
+    uint32_t numberOfProcessorsPerSocket; /*!< \brief Number of HW threads per socket/package in the system */
+    uint32_t numberOfCacheDomains; /*!< \brief Number of LLC caches in the system */
+    uint32_t numberOfCoresPerCache; /*!< \brief Number of CPU cores per LLC cache in the system */
+    uint32_t numberOfProcessorsPerCache; /*!< \brief Number of hardware threads per LLC cache in the system */
+    uint32_t numberOfAffinityDomains; /*!< \brief Number of affinity domains in the current system and length of \a domains array */
+    AffinityDomain* domains; /*!< \brief List of all domains in the system */
+} AffinityDomains;
+
+/** \brief Pointer for exporting the AffinityDomains data structure */
+typedef AffinityDomains* AffinityDomains_t;
+
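+/*
+Usage sketch for the affinity module (an editorial illustration; error handling
+omitted, bdata() comes from the bundled bstring library). The domain tags follow
+the scheme described in AffinityDomain.
+\code
+affinity_init();
+AffinityDomains_t doms = get_affinityDomains();
+for (uint32_t i = 0; i < doms->numberOfAffinityDomains; i++)
+{
+    AffinityDomain* d = &doms->domains[i];
+    printf("domain %s: %u HW threads\n", bdata(d->tag), d->numberOfProcessors);
+}
+affinity_pinThread(0);   // pin the calling thread to CPU 0 (hypothetical choice)
+affinity_finalize();
+\endcode
+*/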
+/*! \brief Initialize affinity information
+
+Initialize the affinity information AffinityDomains_t using the data of the structures
+\a CpuInfo_t, CpuTopology_t and NumaTopology_t
+\sa AffinityDomains_t
+*/
+extern void affinity_init() __attribute__ ((visibility ("default") ));
+/*! \brief Retrieve affinity structure
+
+Get the previously initialized affinity info structure
+\sa AffinityDomains_t
+@return AffinityDomains_t (pointer to internal affinityDomains structure)
+*/
+extern AffinityDomains_t get_affinityDomains(void) __attribute__ ((visibility ("default") ));
+/*! \brief Pin process to a CPU
+
+Pin process to a CPU. Duplicate of likwid_pinProcess()
+@param [in] processorId CPU ID for pinning
+*/
+extern void affinity_pinProcess(int processorId) __attribute__ ((visibility ("default") ));
+/*! \brief Pin processes to a CPU
+
+Pin processes to a CPU. Creates a cpuset with the given processor IDs
+@param [in] cpu_count Number of processors in processorIds
+@param [in] processorIds Array of processor IDs
+*/
+extern void affinity_pinProcesses(int cpu_count, const int* processorIds) __attribute__ ((visibility ("default") ));
+/*! \brief Pin thread to a CPU
+
+Pin thread to a CPU. Duplicate of likwid_pinThread()
+@param [in] processorId CPU ID for pinning
+*/
+extern void affinity_pinThread(int processorId) __attribute__ ((visibility ("default") ));
+/*! \brief Return the CPU ID where the current process runs.
+
+@return CPU ID
+*/
+extern int affinity_processGetProcessorId() __attribute__ ((visibility ("default") ));
+/*! \brief Return the CPU ID where the current thread runs.
+
+@return CPU ID
+*/
+extern int affinity_threadGetProcessorId() __attribute__ ((visibility ("default") ));
+/*! \brief Destroy affinity information structure
+
+Destroys the affinity information structure AffinityDomains_t. Retrieved pointers
+to the structures are no longer valid after this function call
+\sa AffinityDomains_t
+*/
+extern void affinity_finalize() __attribute__ ((visibility ("default") ));
+/** @}*/
+
+/*
+################################################################################
+# CPU string parsing related functions
+################################################################################
+*/
+/** \addtogroup CPUParse CPU string parser module
+ * @{
+ */
+
+/*! \brief Read CPU selection string and resolve to available CPU numbers
+
+Reads the CPU selection string and fills the given list with the CPU numbers
+defined in the selection string. This function is an interface function for the
+different selection modes: scatter, expression, logical and physical.
+@param [in] cpustring Selection string
+@param [in,out] cpulist List of CPUs
+@param [in] length Length of cpulist
+@return error code (>0 on success for the returned list length, -ERRORCODE on failure)
+*/
+extern int cpustr_to_cpulist(const char* cpustring, int* cpulist, int length) __attribute__ ((visibility ("default") ));
+/*! \brief Read NUMA node selection string and resolve to available NUMA node numbers
+
+Reads the NUMA node selection string and fills the given list with the NUMA node numbers
+defined in the selection string.
+@param [in] nodestr Selection string
+@param [out] nodes List of available NUMA nodes
+@param [in] length Length of NUMA node list
+@return error code (>0 on success for the returned list length, -ERRORCODE on failure)
+*/
+extern int nodestr_to_nodelist(const char* nodestr, int* nodes, int length) __attribute__ ((visibility ("default") ));
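+/*
+Usage sketch for the string parser (an editorial illustration): resolve a
+selection string to physical CPU IDs. "S0:0-3", a hypothetical example, selects
+the first four HW threads of socket 0.
+\code
+int cpulist[4];
+int n = cpustr_to_cpulist("S0:0-3", cpulist, 4);
+for (int i = 0; i < n; i++)
+    printf("selected CPU %d\n", cpulist[i]);
+\endcode
+*/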
+/*! \brief Read CPU socket selection string and resolve to available CPU socket numbers
+
+Reads the CPU socket selection string and fills the given list with the CPU socket numbers
+defined in the selection string.
+@param [in] sockstr Selection string
+@param [out] sockets List of available CPU sockets
+@param [in] length Length of CPU socket list
+@return error code (>0 on success for the returned list length, -ERRORCODE on failure)
+*/
+extern int sockstr_to_socklist(const char* sockstr, int* sockets, int length) __attribute__ ((visibility ("default") ));
+
+#ifdef LIKWID_WITH_NVMON
+/*! \brief Read GPU selection string and resolve to available GPU numbers
+
+Reads the GPU selection string and fills the given list with the GPU numbers defined in the selection string.
+@param [in] gpustr Selection string
+@param [out] gpulist List of available GPUs
+@param [in] length Length of GPU list
+@return error code (>0 on success for the returned list length, -ERRORCODE on failure)
+*/
+extern int gpustr_to_gpulist(const char* gpustr, int* gpulist, int length) __attribute__ ((visibility ("default") ));
+
+#endif /* LIKWID_WITH_NVMON */
+
+/** @}*/
+
+/*
+################################################################################
+# Performance monitoring related functions
+################################################################################
+*/
+/** \addtogroup PerfMon Performance monitoring module
+ * @{
+ */
+
+/*! \brief Get all groups
+
+Checks the configured performance group path for the current architecture and
+returns all group names found
+@return Number of performance groups found
+*/
+extern int perfmon_getGroups(char*** groups, char*** shortinfos, char*** longinfos) __attribute__ ((visibility ("default") ));
+
+/*! \brief Free all group information
+
+@param [in] nrgroups Number of groups
+@param [in] groups List of group names
+@param [in] shortinfos List of short information strings about the groups
+@param [in] longinfos List of long information strings about the groups
+*/
+extern void perfmon_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos) __attribute__ ((visibility ("default") ));
+
+/*! \brief Initialize performance monitoring facility
+
+Initialize the performance monitoring feature by creating basic data structures.
+The CPU IDs for the threadsToCpu list can be found in cpuTopology->threadPool[thread_id].apicId.
+The access mode must already be set when calling perfmon_init().
+\sa HPMmode() function and CpuTopology structure with HWThread list
+
+@param [in] nrThreads Number of threads
+@param [in] threadsToCpu List of CPUs
+@return error code (0 on success, -ERRORCODE on failure)
+*/
+extern int perfmon_init(int nrThreads, const int* threadsToCpu) __attribute__ ((visibility ("default") ));
+
+/*! \brief Initialize performance monitoring maps
+
+Initialize the performance monitoring maps for counters, events and Uncore boxes
+for the current architecture. topology_init() and numa_init() must be called before calling
+perfmon_init_maps()
+\sa RegisterMap list, PerfmonEvent list and BoxMap list
+*/
+extern void perfmon_init_maps(void) __attribute__ ((visibility ("default") ));
+/*! \brief Check the performance monitoring maps whether counters and events are available
+
+Checks each counter and event in the performance monitoring maps for its availability on
+the current system. topology_init(), numa_init() and perfmon_init_maps() must be called before calling
+perfmon_check_counter_map().
+\sa RegisterMap list, PerfmonEvent list and BoxMap list
+*/
+extern void perfmon_check_counter_map(int cpu_id) __attribute__ ((visibility ("default") ));
+/*! \brief Add an event string to LIKWID
+
+An event string looks like Eventname:Countername(:Option1:Option2:...),...
+The event name, counter name and options are checked for availability.
+@param [in] eventCString Event string
+@return Returns the ID of the new eventSet
+*/
+extern int perfmon_addEventSet(const char* eventCString) __attribute__ ((visibility ("default") ));
+/*! \brief Setup all performance monitoring counters of an eventSet
+
+@param [in] groupId (returned from perfmon_addEventSet())
+@return error code (-ENOENT if groupId is invalid and -1 if the counters of one CPU cannot be set up)
+*/
+extern int perfmon_setupCounters(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Start performance monitoring counters
+
+Start the counters that have been previously set up by perfmon_setupCounters().
+The counter registers are zeroed before the counters are enabled
+@return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_startCounters(void) __attribute__ ((visibility ("default") ));
+/*! \brief Stop performance monitoring counters
+
+Stop the counters that have been previously started by perfmon_startCounters().
+This function reads the counters, so afterwards the results are available through
+perfmon_getResult, perfmon_getLastResult, perfmon_getMetric and perfmon_getLastMetric.
+@return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_stopCounters(void) __attribute__ ((visibility ("default") ));
+/*! \brief Read the performance monitoring counters on all CPUs
+
+Read the counters that have been previously started by perfmon_startCounters().
+The counters are stopped directly to avoid interference of LIKWID with the measured
+code. Before returning, the counters are started again.
+@return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_readCounters(void) __attribute__ ((visibility ("default") ));
+/*! \brief Read the performance monitoring counters on one CPU
+
+Read the counters that have been previously started by perfmon_startCounters().
+The counters are stopped directly to avoid interference of LIKWID with the measured
+code. Before returning, the counters are started again. Only one CPU is read.
+@param [in] cpu_id CPU ID of the CPU that should be read
+@return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_readCountersCpu(int cpu_id) __attribute__ ((visibility ("default") ));
+/*! \brief Read the performance monitoring counters of all threads in a group
+
+Read the counters that have been previously started by perfmon_startCounters().
+The counters are stopped directly to avoid interference of LIKWID with the measured
+code. Before returning, the counters are started again.
+@param [in] groupId Read the counters for all threads taking part in group
+@return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_readGroupCounters(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Read the performance monitoring counters of one thread in a group
+
+Read the counters that have been previously started by perfmon_startCounters().
+The counters are stopped directly to avoid interference of LIKWID with the measured
+code. Before returning, the counters are started again. Only one thread's CPU is read.
+@param [in] groupId Read the counters defined in group identified with groupId
+@param [in] threadId Read the counters for the thread
+@return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_readGroupThreadCounters(int groupId, int threadId) __attribute__ ((visibility ("default") ));
+/*! \brief Switch the active eventSet to a new one
+
+Stops the currently running counters, switches the eventSet by setting up the
+counters and starts the counters.
+@param [in] new_group ID of group that should be switched to.
+@return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_switchActiveGroup(int new_group) __attribute__ ((visibility ("default") ));
+/*! \brief Close the performance monitoring facility of LIKWID
+
+Deallocates all internal data used during performance monitoring. The
+counter values are no longer accessible after this function.
+*/
+extern void perfmon_finalize(void) __attribute__ ((visibility ("default") ));
+/*! \brief Get the results of the specified group, counter and thread
+
+Get the result of all measurement cycles. The function accounts for counter
+overflows and applies multipliers to the counter values where required.
+@param [in] groupId ID of the group that should be read
+@param [in] eventId ID of the event that should be read
+@param [in] threadId ID of the thread/cpu that should be read
+@return The counter result
+*/
+extern double perfmon_getResult(int groupId, int eventId, int threadId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the last results of the specified group, counter and thread
+
+Get the result of the last measurement cycle. The function accounts for counter
+overflows and applies multipliers to the counter values where required.
+@param [in] groupId ID of the group that should be read
+@param [in] eventId ID of the event that should be read
+@param [in] threadId ID of the thread/cpu that should be read
+@return The counter result
+*/
+extern double perfmon_getLastResult(int groupId, int eventId, int threadId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the metric result of the specified group, metric and thread
+
+Get the metric result of all measurement cycles. It reads all raw results for the given groupId and threadId.
+@param [in] groupId ID of the group that should be read
+@param [in] metricId ID of the metric that should be calculated
+@param [in] threadId ID of the thread/cpu that should be read
+@return The metric result
+*/
+extern double perfmon_getMetric(int groupId, int metricId, int threadId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the last metric result of the specified group, metric and thread
+
+Get the metric result of the last measurement cycle. It reads all raw results for the given groupId and threadId.
+@param [in] groupId ID of the group that should be read
+@param [in] metricId ID of the metric that should be calculated
+@param [in] threadId ID of the thread/cpu that should be read
+@return The metric result
+*/
+extern double perfmon_getLastMetric(int groupId, int metricId, int threadId) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the number of configured event groups
+
+@return Number of groups
+*/
+extern int perfmon_getNumberOfGroups(void) __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of configured eventSets in group
+
+@param [in] groupId ID of group
+@return Number of eventSets
+*/
+extern int perfmon_getNumberOfEvents(int groupId) __attribute__ ((visibility ("default") ));
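+/*
+End-to-end usage sketch for the perfmon module (an editorial illustration;
+the event string and CPU list are hypothetical and error handling is omitted):
+\code
+int cpus[2] = {0, 1};
+topology_init();
+perfmon_init(2, cpus);
+int gid = perfmon_addEventSet("INSTR_RETIRED_ANY:FIXC0");
+perfmon_setupCounters(gid);
+perfmon_startCounters();
+// ... run the code to be measured ...
+perfmon_stopCounters();
+printf("instructions on first thread: %f\n", perfmon_getResult(gid, 0, 0));
+perfmon_finalize();
+topology_finalize();
+\endcode
+*/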
+/*! \brief Get the accumulated measurement time of a group
+
+@param [in] groupId ID of group
+@return Time in seconds the event group was measured
+*/
+extern double perfmon_getTimeOfGroup(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the ID of the currently set up event group
+
+@return Number of active group
+*/
+extern int perfmon_getIdOfActiveGroup(void) __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of threads specified at perfmon_init()
+
+@return Number of threads
+*/
+extern int perfmon_getNumberOfThreads(void) __attribute__ ((visibility ("default") ));
+
+/*! \brief Set verbosity of LIKWID library
+
+@param [in] verbose Verbosity level
+*/
+extern void perfmon_setVerbosity(int verbose) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the event name of the specified group and event
+
+Get the event name as defined in the performance group file
+@param [in] groupId ID of the group that should be read
+@param [in] eventId ID of the event that should be returned
+@return The event name or NULL in case of failure
+*/
+extern char* perfmon_getEventName(int groupId, int eventId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the counter name of the specified group and event
+
+Get the counter name as defined in the performance group file
+@param [in] groupId ID of the group that should be read
+@param [in] eventId ID of the event of which the counter should be returned
+@return The counter name or NULL in case of failure
+*/
+extern char* perfmon_getCounterName(int groupId, int eventId) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the name of a group
+
+Get the name of a group. Either it is the name of the performance group or "Custom"
+@param [in] groupId ID of the group that should be read
+@return The group name or NULL in case of failure
+*/
+extern char* perfmon_getGroupName(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the metric name of the specified group and metric
+
+Get the metric name as defined in the performance group file
+@param [in] groupId ID of the group that should be read
+@param [in] metricId ID of the metric that should be calculated
+@return The metric name or NULL in case of failure
+*/
+extern char* perfmon_getMetricName(int groupId, int metricId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the short informational string of the specified group
+
+Returns the short information string as defined by performance groups or "Custom"
+in case of custom event sets
+@param [in] groupId ID of the group that should be read
+@return The short information or NULL in case of failure
+*/
+extern char* perfmon_getGroupInfoShort(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the long descriptive string of the specified group
+
+Returns the long descriptive string as defined by performance groups or NULL
+in case of custom event sets
+@param [in] groupId ID of the group that should be read
+@return The long description or NULL in case of failure
+*/
+extern char* perfmon_getGroupInfoLong(int groupId) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the number of configured metrics for group
+
+@param [in] groupId ID of group
+@return Number of metrics
+*/
+extern int perfmon_getNumberOfMetrics(int groupId) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the last measurement time of a group
+
+@param [in] groupId ID of group
+@return Time in seconds the event group was measured the last time
+*/
+extern double perfmon_getLastTimeOfGroup(int groupId) __attribute__ ((visibility ("default") ));
+
+/*! \brief Read the output file of the Marker API
+@param [in] filename Filename with Marker API results
+@return 0 or negative error number
+*/
+extern int perfmon_readMarkerFile(const char* filename) __attribute__ ((visibility ("default") ));
+/*! \brief Free the memory allocated for the results read from a Marker API file
+*/
+extern void perfmon_destroyMarkerResults() __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of regions listed in the Marker API result file
+
+@return Number of regions
+*/
+extern int perfmon_getNumberOfRegions() __attribute__ ((visibility ("default") ));
+/*! \brief Get the groupID of a region
+
+@param [in] region ID of region
+@return Group ID of region
+*/
+extern int perfmon_getGroupOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the tag of a region
+@param [in] region ID of region
+@return Tag of region
+*/
+extern char* perfmon_getTagOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of events of a region
+@param [in] region ID of region
+@return Number of events of region
+*/
+extern int perfmon_getEventsOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of metrics of a region
+@param [in] region ID of region
+@return Number of metrics of region
+*/
+extern int perfmon_getMetricsOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of threads of a region
+@param [in] region ID of region
+@return Number of threads of region
+*/
+extern int perfmon_getThreadsOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the cpulist of a region
+@param [in] region ID of region
+@param [in] count Length of cpulist array
+@param [in,out] cpulist cpulist array
+@return Number of threads of region or count, whichever is lower
+*/
+extern int perfmon_getCpulistOfRegion(int region, int count, int* cpulist) __attribute__ ((visibility ("default") ));
+/*! \brief Get the accumulated measurement time of a region for a thread
+@param [in] region ID of region
+@param [in] thread ID of thread
+@return Measurement time of a region for a thread
+*/
+extern double perfmon_getTimeOfRegion(int region, int thread) __attribute__ ((visibility ("default") ));
+/*! \brief Get the call count of a region for a thread
+@param [in] region ID of region
+@param [in] thread ID of thread
+@return Call count of a region for a thread
+*/
+extern int perfmon_getCountOfRegion(int region, int thread) __attribute__ ((visibility ("default") ));
+/*! \brief Get the event result of a region for an event and thread
+@param [in] region ID of region
+@param [in] event ID of event
+@param [in] thread ID of thread
+@return Result of a region for an event and thread
+*/
+extern double perfmon_getResultOfRegionThread(int region, int event, int thread) __attribute__ ((visibility ("default") ));
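+/*
+Usage sketch for evaluating a Marker API output file (an editorial
+illustration; the file name is hypothetical and error handling is omitted):
+\code
+perfmon_readMarkerFile("/tmp/likwid_marker.out");
+int nregions = perfmon_getNumberOfRegions();
+for (int r = 0; r < nregions; r++)
+{
+    printf("region %s: %f s, %d calls on thread 0\n",
+           perfmon_getTagOfRegion(r),
+           perfmon_getTimeOfRegion(r, 0),
+           perfmon_getCountOfRegion(r, 0));
+}
+perfmon_destroyMarkerResults();
+\endcode
+*/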
+/*! \brief Get the metric result of a region for a metric and thread
+@param [in] region ID of region
+@param [in] metricId ID of metric
+@param [in] threadId ID of thread
+@return Metric result of a region for a thread
+*/
+extern double perfmon_getMetricOfRegionThread(int region, int metricId, int threadId) __attribute__ ((visibility ("default") ));
+
+/** @}*/
+
+/*
+################################################################################
+# Performance group related functions
+################################################################################
+*/
+
+/** \addtogroup PerfGroup performance group module
+ * @{
+ */
+
+/*! \brief The GroupInfo data structure describes a performance group
+
+Groups can either be read in from a file or be built from a custom event set. For
+performance groups, commonly all values are set. For groups with a custom event set,
+the fields groupname and shortinfo are set to 'Custom', longinfo is NULL and in
+general the nmetrics value is 0.
+*/
+typedef struct {
+    char* groupname; /*!< \brief Name of the group: performance group name or 'Custom' */
+    char* shortinfo; /*!< \brief Short info string for the group or 'Custom' */
+    int nevents; /*!< \brief Number of event/counter combinations */
+    char** events; /*!< \brief List of events */
+    char** counters; /*!< \brief List of counter registers */
+    int nmetrics; /*!< \brief Number of metrics */
+    char** metricnames; /*!< \brief Metric names */
+    char** metricformulas; /*!< \brief Metric formulas */
+    char* longinfo; /*!< \brief Descriptive text about the group or empty */
+} GroupInfo;
+
+/*! \brief Initialize values in GroupInfo struct
+
+Initialize values in GroupInfo struct. The function does NOT allocate the GroupInfo struct
+*/
+int perfgroup_new(GroupInfo* ginfo) __attribute__ ((visibility ("default") ));
+
+/*! \brief Add a counter and event combination to the group
+
+Add a counter and event combination to the group.
+@param [in] ginfo GroupInfo struct
+@param [in] counter String with counter name
+@param [in] event String with event name
+@return 0 for success, -EINVAL or -ENOMEM in case of error.
+*/
+int perfgroup_addEvent(GroupInfo* ginfo, char* counter, char* event) __attribute__ ((visibility ("default") ));
+
+/*! \brief Remove a counter and event combination from a group
+
+Remove a counter and event combination from a group
+@param [in] ginfo GroupInfo struct
+@param [in] counter String with counter name
+*/
+void perfgroup_removeEvent(GroupInfo* ginfo, char* counter) __attribute__ ((visibility ("default") ));
+
+/*! \brief Add a metric to the group
+
+Add a metric to the group
+@param [in] ginfo GroupInfo struct
+@param [in] mname String with metric name/description
+@param [in] mcalc String with metric formula. No spaces in string.
+@return 0 for success, -EINVAL or -ENOMEM in case of error.
+*/
+int perfgroup_addMetric(GroupInfo* ginfo, char* mname, char* mcalc) __attribute__ ((visibility ("default") ));
+/*! \brief Remove a metric from a group
+
+Remove a metric from a group
+@param [in] ginfo GroupInfo struct
+@param [in] mname String with metric name/description
+*/
+void perfgroup_removeMetric(GroupInfo* ginfo, char* mname) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the event string of a group needed for perfmon_addEventSet
+
+Get the event string of a group needed for perfmon_addEventSet
+@param [in] ginfo GroupInfo struct
+@return String with eventset or NULL
+*/
+char* perfgroup_getEventStr(GroupInfo* ginfo) __attribute__ ((visibility ("default") ));
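+/*
+Usage sketch for the perfgroup module (an editorial illustration; the group
+path and group name are hypothetical, error handling is omitted):
+\code
+GroupInfo ginfo;
+perfgroup_readGroup("/usr/local/share/likwid/perfgroups", cpuid_info.short_name,
+                    "BRANCH", &ginfo);
+char* estr = perfgroup_getEventStr(&ginfo);
+int gid = perfmon_addEventSet(estr);   // hand the eventset string to perfmon
+perfgroup_returnEventStr(estr);
+perfgroup_returnGroup(&ginfo);
+\endcode
+*/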
+/*! \brief Return the eventset string of a group
+
+Return the eventset string of a group
+@param [in] eventStr Eventset string
+*/
+void perfgroup_returnEventStr(char* eventStr) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the group name of a group
+
+Get the group name of a group
+@param [in] ginfo GroupInfo struct
+@return String with group name or NULL
+*/
+char* perfgroup_getGroupName(GroupInfo* ginfo) __attribute__ ((visibility ("default") ));
+/*! \brief Set the group name of a group
+
+Set the group name of a group. String must be zero-terminated
+@param [in] ginfo GroupInfo struct
+@param [in] groupName String with group name
+@return 0 for success, -EINVAL or -ENOMEM in case of error.
+*/
+int perfgroup_setGroupName(GroupInfo* ginfo, char* groupName) __attribute__ ((visibility ("default") ));
+/*! \brief Return the group name string of a group
+
+Return the group name string of a group
+@param [in] gname Group name string
+*/
+void perfgroup_returnGroupName(char* gname) __attribute__ ((visibility ("default") ));
+
+
+/*! \brief Set the short information string of a group
+
+Set the short information string of a group. String must be zero-terminated
+@param [in] ginfo GroupInfo struct
+@param [in] shortInfo String with short information
+@return 0 for success, -EINVAL or -ENOMEM in case of error.
+*/
+int perfgroup_setShortInfo(GroupInfo* ginfo, char* shortInfo) __attribute__ ((visibility ("default") ));
+/*! \brief Get the short information string of a group
+
+Get the short information string of a group
+@param [in] ginfo GroupInfo struct
+@return String with short information or NULL
+*/
+char* perfgroup_getShortInfo(GroupInfo* ginfo) __attribute__ ((visibility ("default") ));
+/*! \brief Return the short information string of a group
+
+Return the short information string of a group
+@param [in] sinfo Short information string
+*/
+void perfgroup_returnShortInfo(char* sinfo) __attribute__ ((visibility ("default") ));
+
+/*! \brief Set the long information string of a group
+
+Set the long information string of a group. String must be zero-terminated
+@param [in] ginfo GroupInfo struct
+@param [in] longInfo String with long information
+@return 0 for success, -EINVAL or -ENOMEM in case of error.
+*/
+int perfgroup_setLongInfo(GroupInfo* ginfo, char* longInfo) __attribute__ ((visibility ("default") ));
+/*! \brief Get the long information string of a group
+
+Get the long information string of a group
+@param [in] ginfo GroupInfo struct
+@return String with long information or NULL
+*/
+char* perfgroup_getLongInfo(GroupInfo* ginfo) __attribute__ ((visibility ("default") ));
+/*! \brief Return the long information string of a group
+
+Return the long information string of a group
+@param [in] linfo Long information string
+*/
+void perfgroup_returnLongInfo(char* linfo) __attribute__ ((visibility ("default") ));
+
+/*! \brief Merge two groups
+
+Merge two groups (group2 into group1).
+@param [in,out] grp1 Group1
+@param [in] grp2 Group2
+@return 0 for success, -EINVAL or -ENOMEM in case of error.
+*/
+int perfgroup_mergeGroups(GroupInfo* grp1, GroupInfo* grp2) __attribute__ ((visibility ("default") ));
+
+/*! \brief Read group from file
+
+Read group from file
+@param [in] grouppath Base path to all groups
+@param [in] architecture Architecture string (e.g. short_info in cpuid_info)
+@param [in] groupname Group name
+@param [in,out] ginfo Group filled with data from file
+@return 0 for success, -EINVAL or -ENOMEM in case of error.
+*/
+int perfgroup_readGroup(const char* grouppath, const char* architecture, const char* groupname, GroupInfo* ginfo) __attribute__ ((visibility ("default") ));
+/*! \brief Create group from event string
+
+Create group from event string (list of event:counter(:opts)).
+@param [in] eventStr event string
+@param [in,out] ginfo Group filled with data from event string
+@return 0 for success, -EINVAL or -ENOMEM in case of error.
+*/
+int perfgroup_customGroup(const char* eventStr, GroupInfo* ginfo) __attribute__ ((visibility ("default") ));
+
+/*! \brief Return group
+
+Return group (frees internal lists)
+@param [in] ginfo Performance group info
+*/
+void perfgroup_returnGroup(GroupInfo* ginfo) __attribute__ ((visibility ("default") ));
+/*! \brief Get all groups available in the system (base + user home)
+
+Get all groups available in the system (base + user home)
+@param [in] grouppath Base path to all groups
+@param [in] architecture Architecture string (e.g. short_info in cpuid_info)
+@param [out] groupnames List of group names
+@param [out] groupshort List of groups' short information strings
+@param [out] grouplong List of groups' long information strings
+@return number of groups, -EINVAL or -ENOMEM in case of error.
+*/
+int perfgroup_getGroups( const char* grouppath, const char* architecture, char*** groupnames, char*** groupshort, char*** grouplong) __attribute__ ((visibility ("default") ));
+/*! \brief Return list of all groups
+
+Return list of all groups
+@param [in] groups Number of groups
+@param [in] groupnames List of group names
+@param [in] groupshort List of groups' short information strings
+@param [in] grouplong List of groups' long information strings
+*/
+void perfgroup_returnGroups(int groups, char** groupnames, char** groupshort, char** grouplong) __attribute__ ((visibility ("default") ));
+
+
+
+
+/** @}*/
+
+/*
+################################################################################
+# Time measurements related functions
+################################################################################
+*/
+
+/** \addtogroup TimerMon Time measurement module
+ * @{
+ */
+
+/*! \brief Union holding a 64 bit cycle counter, accessible as one 64 bit value or as two 32 bit fields
+\extends TimerData
+*/
+typedef union
+{
+    uint64_t int64; /*!< \brief Cycle count in 64 bit */
+    struct {uint32_t lo, hi;} int32; /*!< \brief Cycle count stored in two 32 bit fields */
+} TscCounter;
+
+/*! \brief Struct defining the start and stop time of a time interval
+*/
+typedef struct {
+    TscCounter start; /*!< \brief Cycles at start */
+    TscCounter stop; /*!< \brief Cycles at stop */
+} TimerData;
+
+/*! \brief Initialize timer by retrieving baseline frequency and cpu clock
+*/
+extern void timer_init( void ) __attribute__ ((visibility ("default") ));
+/*! \brief Return the measured interval in seconds
+
+@param [in] time Structure holding the cycle count at start and stop
+@return Time in seconds
+*/
+extern double timer_print( const TimerData* time) __attribute__ ((visibility ("default") ));
+/*! \brief Return the measured interval in cycles
+
+@param [in] time Structure holding the cycle count at start and stop
+@return Time in cycles
+*/
+extern uint64_t timer_printCycles( const TimerData* time) __attribute__ ((visibility ("default") ));
+/*! \brief Reset values in TimerData
+
+@param [in] time Structure holding the cycle count at start and stop
+*/
+extern void timer_reset( TimerData* time ) __attribute__ ((visibility ("default") ));
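+/*
+Usage sketch for the timer module (an editorial illustration; error handling
+omitted):
+\code
+TimerData t;
+timer_init();
+timer_start(&t);
+// ... code to be timed ...
+timer_stop(&t);
+printf("%f s (%llu cycles)\n", timer_print(&t),
+       (unsigned long long)timer_printCycles(&t));
+timer_finalize();
+\endcode
+*/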
+/*! \brief Return the CPU clock determined at timer_init
+
+@return CPU clock
+*/
+extern uint64_t timer_getCpuClock( void ) __attribute__ ((visibility ("default") ));
+/*! \brief Return the current CPU clock read from sysfs
+
+@return CPU clock
+*/
+extern uint64_t timer_getCpuClockCurrent( int cpu_id ) __attribute__ ((visibility ("default") ));
+/*! \brief Return the cycle clock determined at timer_init
+
+@return Cycle clock
+*/
+extern uint64_t timer_getCycleClock( void ) __attribute__ ((visibility ("default") ));
+/*! \brief Return the baseline CPU clock determined at timer_init
+
+@return Baseline CPU clock
+*/
+extern uint64_t timer_getBaseline( void ) __attribute__ ((visibility ("default") ));
+/*! \brief Start time measurement
+
+@param [in,out] time Structure holding the cycle count at start
+*/
+extern void timer_start( TimerData* time ) __attribute__ ((visibility ("default") ));
+/*! \brief Stop time measurement
+
+@param [in,out] time Structure holding the cycle count at stop
+*/
+extern void timer_stop ( TimerData* time) __attribute__ ((visibility ("default") ));
+/*! \brief Sleep for specified usecs
+
+@param [in] usec Amount of usecs to sleep
+*/
+extern int timer_sleep(unsigned long usec) __attribute__ ((visibility ("default") ));
+
+/*! \brief Finalize timer module
+
+*/
+extern void timer_finalize(void) __attribute__ ((visibility ("default") ));
+
+/** @}*/
+
+/*
+################################################################################
+# Power measurements related functions
+################################################################################
+*/
+/** \addtogroup PowerMon Power and Energy monitoring module
+ * @{
+ */
+
+/*!
+\def NUM_POWER_DOMAINS
+Number of currently supported RAPL domains
+*/
+#define NUM_POWER_DOMAINS 5
+/*! \brief List of all RAPL domain names
+*/
+extern const char* power_names[NUM_POWER_DOMAINS] __attribute__ ((visibility ("default") ));
+
+/*!
+\def POWER_DOMAIN_SUPPORT_STATUS
+Flag to check in a PowerDomain's supportFlags whether the status MSR registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_STATUS (1ULL<<0)
+/*!
+\def POWER_DOMAIN_SUPPORT_LIMIT
+Flag to check in a PowerDomain's supportFlags whether the limit MSR registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_LIMIT (1ULL<<1)
+/*!
+\def POWER_DOMAIN_SUPPORT_POLICY
+Flag to check in a PowerDomain's supportFlags whether the policy MSR registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_POLICY (1ULL<<2)
+/*!
+\def POWER_DOMAIN_SUPPORT_PERF
+Flag to check in a PowerDomain's supportFlags whether the perf MSR registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_PERF (1ULL<<3)
+/*!
+\def POWER_DOMAIN_SUPPORT_INFO
+Flag to check in a PowerDomain's supportFlags whether the info MSR registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_INFO (1ULL<<4)
+
+
+/*! \brief Information structure of CPU's turbo mode
+\extends PowerInfo
+*/
+typedef struct {
+    int numSteps; /*!< \brief Number of turbo mode steps/frequencies */
+    double* steps; /*!< \brief List of turbo mode steps */
+} TurboBoost;
+
+/*! \brief Enum for all supported RAPL domains
+\extends PowerDomain
+*/
+typedef enum {
+    PKG = 0, /*!< \brief PKG domain, mostly one CPU socket/package */
+    PP0 = 1, /*!< \brief PP0 domain, not clearly defined by Intel */
+    PP1 = 2, /*!< \brief PP1 domain, not clearly defined by Intel */
+    DRAM = 3, /*!< \brief DRAM domain, the memory modules */
+    PLATFORM = 4 /*!< \brief PLATFORM domain, the whole system (if powered through the main board) */
+} PowerType;
+
+/*! \brief Structure describing a RAPL power domain
+\extends PowerInfo
+*/
+typedef struct {
+    PowerType type; /*!< \brief Identifier which RAPL domain is managed by this struct */
+    uint32_t supportFlags; /*!< \brief Bitmask of the features supported by the power domain */
+    double energyUnit; /*!< \brief Multiplier for energy measurements */
+    double tdp; /*!< \brief Thermal Design Power (maximum amount of heat generated by the CPU) */
+    double minPower; /*!< \brief Minimal power consumption of the CPU */
+    double maxPower; /*!< \brief Maximal power consumption of the CPU */
+    double maxTimeWindow; /*!< \brief Maximal time window for power limiting */
+} PowerDomain;
+
+/*! \brief Information structure of CPU's power measurement facility
+*/
+typedef struct {
+    double baseFrequency; /*!< \brief Base frequency of the CPU */
+    double minFrequency; /*!< \brief Minimal frequency of the CPU */
+    TurboBoost turbo; /*!< \brief Turbo boost information */
+    int hasRAPL; /*!< \brief RAPL support flag */
+    double powerUnit; /*!< \brief Multiplier for power measurements */
+    double timeUnit; /*!< \brief Multiplier for time information */
+    double uncoreMinFreq; /*!< \brief Minimal uncore frequency */
+    double uncoreMaxFreq; /*!< \brief Maximal uncore frequency */
+    uint8_t perfBias; /*!< \brief Performance energy bias */
+    PowerDomain domains[NUM_POWER_DOMAINS]; /*!< \brief List of power domains */
+} PowerInfo;
+
+/*! \brief Power measurement data for start/stop measurements
+*/
+typedef struct {
+    int domain; /*!< \brief RAPL domain identifier */
+    uint32_t before; /*!< \brief Counter state at start */
+    uint32_t after; /*!< \brief Counter state at stop */
+} PowerData;
+
+/*! \brief Variable holding the global power information structure */
+extern PowerInfo power_info;
+
+/** \brief Pointer for exporting the PowerInfo data structure */
+typedef PowerInfo* PowerInfo_t;
+/** \brief Pointer for exporting the PowerData data structure */
+typedef PowerData* PowerData_t;
+
+/*! \brief Initialize energy measurements on a specific CPU
+
+Additionally, it reads basic information about the energy measurements like the
+minimal measurement time.
+@param [in] cpuId Initialize energy facility for this CPU
+@return RAPL status (0=No RAPL, 1=RAPL working)
+*/
+extern int power_init(int cpuId) __attribute__ ((visibility ("default") ));
+/*! \brief Get a pointer to the energy facility information
+
+@return PowerInfo_t pointer
+\sa PowerInfo_t
+*/
+extern PowerInfo_t get_powerInfo(void) __attribute__ ((visibility ("default") ));
+/*! \brief Read the current energy value
+
+@param [in] cpuId Read energy facility for this CPU
+@param [in] reg Energy register
+@param [out] data Energy data
+*/
+extern int power_read(int cpuId, uint64_t reg, uint32_t *data) __attribute__ ((visibility ("default") ));
+/*! \brief Read the current energy value using a specific communication socket
+
+@param [in] socket_fd Communication socket for the read operation
+@param [in] cpuId Read energy facility for this CPU
+@param [in] reg Energy register
+@param [out] data Energy data
+*/
+extern int power_tread(int socket_fd, int cpuId, uint64_t reg, uint32_t *data) __attribute__ ((visibility ("default") ));
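+/*
+Usage sketch for the power module using the start/stop functions declared
+below (an editorial illustration; assumes a RAPL-capable CPU 0 and omits
+error handling):
+\code
+PowerData pd;
+if (power_init(0) == 1)
+{
+    power_start(&pd, 0, PKG);
+    // ... code whose energy consumption is of interest ...
+    power_stop(&pd, 0, PKG);
+    printf("energy: %f J\n", power_printEnergy(&pd));
+    power_finalize();
+}
+\endcode
+*/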
+/*! \brief Start energy measurements
+
+@param [in,out] data Data structure holding start and stop values for energy measurements
+@param [in] cpuId Start energy facility for this CPU
+@param [in] type Which type should be measured
+@return error code
+*/
+extern int power_start(PowerData_t data, int cpuId, PowerType type) __attribute__ ((visibility ("default") ));
+/*! \brief Stop energy measurements
+
+@param [in,out] data Data structure holding start and stop values for energy measurements
+@param [in] cpuId Stop energy facility for this CPU
+@param [in] type Which type should be measured
+@return error code
+*/
+extern int power_stop(PowerData_t data, int cpuId, PowerType type) __attribute__ ((visibility ("default") ));
+/*! \brief Print energy measurements gathered by power_start() and power_stop()
+
+@param [in] data Data structure holding start and stop values for energy measurements
+@return Consumed energy in Joules
+*/
+extern double power_printEnergy(const PowerData* data) __attribute__ ((visibility ("default") ));
+/*! \brief Get energy unit
+
+@param [in] domain RAPL domain ID
+@return Energy unit of the given RAPL domain
+*/
+extern double power_getEnergyUnit(int domain) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the values of the limit register of a domain
+NOT IMPLEMENTED
+
+@param [in] cpuId CPU ID
+@param [in] domain RAPL domain ID
+@param [out] power Energy limit
+@param [out] time Time limit
+@return error code
+*/
+int power_limitGet(int cpuId, PowerType domain, double* power, double* time) __attribute__ ((visibility ("default") ));
+
+/*! \brief Set the values of the limit register of a domain
+NOT IMPLEMENTED
+
+@param [in] cpuId CPU ID
+@param [in] domain RAPL domain ID
+@param [in] power Energy limit
+@param [in] time Time limit
+@param [in] doClamping Activate clamping (going below OS-requested power level)
+@return error code
+*/
+int power_limitSet(int cpuId, PowerType domain, double power, double time, int doClamping) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the state of an energy limit, activated or deactivated
+NOT IMPLEMENTED
+
+@param [in] cpuId CPU ID
+@param [in] domain RAPL domain ID
+@return state, 1 for active, 0 for inactive
+*/
+int power_limitState(int cpuId, PowerType domain) __attribute__ ((visibility ("default") ));
+
+/*! \brief Free the space of the internal power_unit data
+*/
+extern void power_finalize(void) __attribute__ ((visibility ("default") ));
+/** @}*/
+
+/*
+################################################################################
+# Thermal measurements related functions
+################################################################################
+*/
+/** \addtogroup ThermalMon Thermal monitoring module
+ * @{
+ */
+/*! \brief Initialize thermal measurements on a specific CPU
+
+@param [in] cpuId Initialize thermal facility for this CPU
+*/
+extern void thermal_init(int cpuId) __attribute__ ((visibility ("default") ));
+/*! \brief Read the current thermal value
+
+@param [in] cpuId Read thermal facility for this CPU
+@param [out] data Thermal data
+*/
+extern int thermal_read(int cpuId, uint32_t *data) __attribute__ ((visibility ("default") ));
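+/*
+Usage sketch for the thermal module (an editorial illustration; assumes CPU 0
+and that a return value of 0 indicates success):
+\code
+uint32_t temp = 0;
+thermal_init(0);
+if (thermal_read(0, &temp) == 0)
+    printf("thermal reading: %u\n", temp);
+\endcode
+*/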
+/*! \brief Read the current thermal value using a specific communication socket
+
+@param [in] socket_fd Communication socket for the read operation
+@param [in] cpuId Read thermal facility for this CPU
+@param [out] data Thermal data
+*/
+extern int thermal_tread(int socket_fd, int cpuId, uint32_t *data) __attribute__ ((visibility ("default") ));
+/** @}*/
+
+
+/*
+################################################################################
+# Memory sweeping related functions
+################################################################################
+*/
+/** \addtogroup MemSweep Memory sweeping module
+ * @{
+ */
+/*! \brief Sweep the memory of a NUMA node
+
+Sweeps (zeros) the memory of the NUMA node with ID \a domainId
+@param [in] domainId NUMA node ID
+*/
+extern void memsweep_domain(int domainId) __attribute__ ((visibility ("default") ));
+/*! \brief Sweep the memory of all NUMA nodes covered by a CPU list
+
+Sweeps (zeros) the memory of all NUMA nodes containing the CPUs in \a processorList
+@param [in] processorList List of CPU IDs
+@param [in] numberOfProcessors Number of CPUs in list
+*/
+extern void memsweep_threadGroup(const int* processorList, int numberOfProcessors) __attribute__ ((visibility ("default") ));
+/** @}*/
+
+/*
+################################################################################
+# CPU feature related functions
+################################################################################
+*/
+/** \addtogroup CpuFeatures Retrieval and manipulation of processor features
+ * @{
+ */
+/*! \brief Enumeration of all CPU related features.
+*/
+typedef enum {
+    FEAT_HW_PREFETCHER=0, /*!< \brief Hardware prefetcher */
+    FEAT_CL_PREFETCHER, /*!< \brief Adjacent cache line prefetcher */
+    FEAT_DCU_PREFETCHER, /*!< \brief DCU L1 data cache prefetcher */
+    FEAT_IP_PREFETCHER, /*!< \brief IP L1 data cache prefetcher */
+    FEAT_FAST_STRINGS, /*!< \brief Fast-strings feature */
+    FEAT_THERMAL_CONTROL, /*!< \brief Automatic Thermal Control Circuit */
+    FEAT_PERF_MON, /*!< \brief Hardware performance monitoring */
+    FEAT_FERR_MULTIPLEX, /*!< \brief FERR# Multiplexing, must be 1 for XAPIC interrupt model */
+    FEAT_BRANCH_TRACE_STORAGE, /*!< \brief Branch Trace Storage */
+    FEAT_XTPR_MESSAGE, /*!< \brief xTPR Message to set processor priority */
+    FEAT_PEBS, /*!< \brief Precise Event Based Sampling (PEBS) */
+    FEAT_SPEEDSTEP, /*!< \brief Enhanced Intel SpeedStep Technology to reduce energy consumption*/
+    FEAT_MONITOR, /*!< \brief MONITOR/MWAIT feature to monitor write-back stores*/
+    FEAT_SPEEDSTEP_LOCK, /*!< \brief Enhanced Intel SpeedStep Technology Select Lock */
+    FEAT_CPUID_MAX_VAL, /*!< \brief Limit CPUID Maxval */
+    FEAT_XD_BIT, /*!< \brief Execute Disable Bit */
+    FEAT_DYN_ACCEL, /*!< \brief Intel Dynamic Acceleration */
+    FEAT_TURBO_MODE, /*!< \brief Intel Turbo Mode */
+    FEAT_TM2, /*!< \brief Thermal Monitoring 2 */
+    CPUFEATURES_MAX
+} CpuFeature;
+
+/*! \brief Initialize the internal feature variables for all CPUs
+
+Initialize the internal feature variables for all CPUs
+*/
+extern void cpuFeatures_init() __attribute__ ((visibility ("default") ));
+/*! \brief Print state of all CPU features for a given CPU
+
+Print state of all CPU features for a given CPU
+@param [in] cpu CPU ID
+*/
+extern void cpuFeatures_print(int cpu) __attribute__ ((visibility ("default") ));
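+/*
+Usage sketch for the CPU features module, using the accessors declared below
+(an editorial illustration; assumes CPU 0 and sufficient privileges for MSR
+access):
+\code
+cpuFeatures_init();
+if (cpuFeatures_get(0, FEAT_HW_PREFETCHER) == 1)
+    cpuFeatures_disable(0, FEAT_HW_PREFETCHER, 1);   // disable and print outcome
+\endcode
+*/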
\brief Get state of a CPU feature for a given CPU
+
+Get state of a CPU feature for a given CPU
+@param [in] cpu CPU ID
+@param [in] type CPU feature
+@return State of CPU feature (1=enabled, 0=disabled)
+*/
+extern int cpuFeatures_get(int cpu, CpuFeature type) __attribute__ ((visibility ("default") ));
+/*! \brief Get the name of a CPU feature
+
+Get the name of a CPU feature
+@param [in] type CPU feature
+@return Name of the CPU feature or NULL if feature is not available
+*/
+extern char* cpuFeatures_name(CpuFeature type) __attribute__ ((visibility ("default") ));
+/*! \brief Enable a CPU feature for a specific CPU
+
+Enable a CPU feature for a specific CPU. Only the state of the prefetchers can be changed, all other features return -EINVAL
+@param [in] cpu CPU ID
+@param [in] type CPU feature
+@param [in] print Print outcome of operation
+@return Status of operation (0=success, all others are errors, either by MSR access or invalid feature)
+*/
+extern int cpuFeatures_enable(int cpu, CpuFeature type, int print) __attribute__ ((visibility ("default") ));
+/*! \brief Disable a CPU feature for a specific CPU
+
+Disable a CPU feature for a specific CPU. Only the state of the prefetchers can be changed, all other features return -EINVAL
+@param [in] cpu CPU ID
+@param [in] type CPU feature
+@param [in] print Print outcome of operation
+@return Status of operation (0=success, all others are errors, either by MSR access or invalid feature)
+*/
+extern int cpuFeatures_disable(int cpu, CpuFeature type, int print) __attribute__ ((visibility ("default") ));
+/** @}*/
+
+
+/*
+################################################################################
+# CPU frequency related functions
+################################################################################
+*/
+/** \addtogroup CpuFreq Retrieval and manipulation of processor clock frequencies
+ * @{
+ */
+/*! \brief Initialize cpu frequency module
+
+Initialize cpu frequency module
+@return returns 0 if successful and 1 for an invalid access mode
+*/
+extern int freq_init(void) __attribute__ ((visibility ("default") ));
+/*! \brief Get the base clock frequency of a hardware thread
+
+Get the base clock frequency of a hardware thread
+@param [in] cpu_id CPU ID
+@return Frequency or 0 in case of errors
+*/
+uint64_t freq_getCpuClockBase(const int cpu_id) __attribute__ ((visibility ("default") ));
+/*! \brief Get the current clock frequency of a hardware thread
+
+Get the current clock frequency of a hardware thread
+@param [in] cpu_id CPU ID
+@return Frequency or 0 in case of errors
+*/
+extern uint64_t freq_getCpuClockCurrent(const int cpu_id) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the maximal clock frequency of a hardware thread
+
+Get the maximal clock frequency of a hardware thread
+@param [in] cpu_id CPU ID
+@return Frequency or 0 in case of errors
+*/
+extern uint64_t freq_getCpuClockMax(const int cpu_id) __attribute__ ((visibility ("default") ));
+/*! \brief Get the maximal available clock frequency of a hardware thread
+
+Get the maximal available clock frequency of a hardware thread
+@param [in] cpu_id CPU ID
+@return Frequency or 0 in case of errors
+*/
+extern uint64_t freq_getConfCpuClockMax(const int cpu_id) __attribute__ ((visibility ("default") ));
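+
+/*
+ Illustrative sketch (added for this document, not part of the original
+ header): pin the clock of hardware thread 0 to its maximal frequency. The
+ setters freq_setCpuClockMin()/freq_setCpuClockMax() declared below take the
+ frequency in kHz; the Uncore setters take a socket ID and MHz, and 2000 MHz
+ is an arbitrary example value. Error handling is omitted.
+
+    if (freq_init() == 0) {
+        uint64_t fmax = freq_getCpuClockMax(0);  // maximal clock of thread 0
+        freq_setCpuClockMin(0, fmax);            // pin min = max = fmax (kHz)
+        freq_setCpuClockMax(0, fmax);
+        freq_setUncoreFreqMin(0, 2000);          // socket 0, frequency in MHz
+        freq_setUncoreFreqMax(0, 2000);
+        freq_finalize();
+    }
+*/
+/*!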
\brief Set the maximal clock frequency of a hardware thread
+
+Set the maximal clock frequency of a hardware thread
+@param [in] cpu_id CPU ID
+@param [in] freq Frequency in kHz
+@return Frequency or 0 in case of errors
+*/
+extern uint64_t freq_setCpuClockMax(const int cpu_id, const uint64_t freq) __attribute__ ((visibility ("default") ));
+/*! \brief Get the minimal clock frequency of a hardware thread
+
+Get the minimal clock frequency of a hardware thread
+@param [in] cpu_id CPU ID
+@return Frequency or 0 in case of errors
+*/
+extern uint64_t freq_getCpuClockMin(const int cpu_id) __attribute__ ((visibility ("default") ));
+/*! \brief Get the minimal available clock frequency of a hardware thread
+
+Get the minimal available clock frequency of a hardware thread
+@param [in] cpu_id CPU ID
+@return Frequency or 0 in case of errors
+*/
+extern uint64_t freq_getConfCpuClockMin(const int cpu_id) __attribute__ ((visibility ("default") ));
+/*! \brief Set the minimal clock frequency of a hardware thread
+
+Set the minimal clock frequency of a hardware thread
+@param [in] cpu_id CPU ID
+@param [in] freq Frequency in kHz
+@return Frequency or 0 in case of errors
+*/
+extern uint64_t freq_setCpuClockMin(const int cpu_id, const uint64_t freq) __attribute__ ((visibility ("default") ));
+/*! \brief De/Activate turbo mode for a hardware thread
+
+De/Activate turbo mode for a hardware thread
+@param [in] cpu_id CPU ID
+@param [in] turbo (0=off, 1=on)
+@return 1 on success or 0 in case of errors
+*/
+extern int freq_setTurbo(const int cpu_id, int turbo) __attribute__ ((visibility ("default") ));
+/*! \brief Get state of turbo mode for a hardware thread
+
+Get state of turbo mode for a hardware thread
+@param [in] cpu_id CPU ID
+@return 1=Turbo active or 0=Turbo inactive
+*/
+extern int freq_getTurbo(const int cpu_id) __attribute__ ((visibility ("default") ));
+/*! \brief Get the frequency governor of a hardware thread
+
+Get the frequency governor of a hardware thread. The returned string must be freed by the caller.
+@param [in] cpu_id CPU ID
+@return Governor or NULL in case of errors
+*/
+extern char * freq_getGovernor(const int cpu_id) __attribute__ ((visibility ("default") ));
+/*! \brief Set the frequency governor of a hardware thread
+
+Set the frequency governor of a hardware thread.
+@param [in] cpu_id CPU ID
+@param [in] gov Governor
+@return 1 on success or 0 in case of errors
+*/
+extern int freq_setGovernor(const int cpu_id, const char* gov) __attribute__ ((visibility ("default") ));
+/*! \brief Get the available frequencies of a hardware thread
+
+Get the available frequencies of a hardware thread. The returned string must be freed by the caller.
+@param [in] cpu_id CPU ID
+@return String with available frequencies or NULL in case of errors
+*/
+extern char * freq_getAvailFreq(const int cpu_id) __attribute__ ((visibility ("default") ));
+/*! \brief Get the available frequency governors of a hardware thread
+
+Get the available frequency governors of a hardware thread. The returned string must be freed by the caller.
+@param [in] cpu_id CPU ID
+@return String with available frequency governors or NULL in case of errors
+*/
+extern char * freq_getAvailGovs(const int cpu_id) __attribute__ ((visibility ("default") ));
+
+/*! \brief Set the minimal Uncore frequency
+
+Set the minimal Uncore frequency. Since the ranges are not documented, valid frequencies range from the minimal CPU clock to the maximal Turbo clock. When selecting a frequency at the borders, please verify with the UNCORE_CLOCK event that the setting took effect.
+@param [in] socket_id ID of socket
+@param [in] freq Frequency in MHz
+@return 0 for success, -ERROR on failure
+*/
+extern int freq_setUncoreFreqMin(const int socket_id, const uint64_t freq) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the minimal Uncore frequency
+
+Get the minimal Uncore frequency.
+@param [in] socket_id ID of socket
+@return Frequency in MHz or 0 on failure
+*/
+extern uint64_t freq_getUncoreFreqMin(const int socket_id) __attribute__ ((visibility ("default") ));
+
+/*! \brief Set the maximal Uncore frequency
+
+Set the maximal Uncore frequency. Since the ranges are not documented, valid frequencies range from the minimal CPU clock to the maximal Turbo clock. When selecting a frequency at the borders, please verify with the UNCORE_CLOCK event that the setting took effect.
+@param [in] socket_id ID of socket
+@param [in] freq Frequency in MHz
+@return 0 for success, -ERROR on failure
+*/
+extern int freq_setUncoreFreqMax(const int socket_id, const uint64_t freq) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the maximal Uncore frequency
+
+Get the maximal Uncore frequency.
+@param [in] socket_id ID of socket
+@return Frequency in MHz or 0 on failure
+*/
+extern uint64_t freq_getUncoreFreqMax(const int socket_id) __attribute__ ((visibility ("default") ));
+/*! \brief Get the current Uncore frequency
+
+Get the current Uncore frequency.
+@param [in] socket_id ID of socket
+@return Frequency in MHz or 0 on failure
+*/
+extern uint64_t freq_getUncoreFreqCur(const int socket_id) __attribute__ ((visibility ("default") ));
+/*! \brief Finalize cpu frequency module
+
+Finalize cpu frequency module
+*/
+extern void freq_finalize(void) __attribute__ ((visibility ("default") ));
+/** @}*/
+
+
+/*
+################################################################################
+# Performance monitoring for NVIDIA GPUs related functions
+################################################################################
+*/
+/** \addtogroup Nvmon Performance monitoring for NVIDIA GPUs
+ * @{
+ */
+
+#if defined(LIKWID_WITH_NVMON) || defined(LIKWID_NVMON)
+/*! \brief Structure with general GPU information for each device
+
+General information covers GPU devid, name and clock and memory specific information.
+Most information comes from cuDeviceGetProperties() and cuDeviceGetAttribute().
+*/
+typedef struct {
+    int devid; /*!< \brief Device ID */
+    int numaNode; /*!< \brief Closest NUMA domain to the device */
+    char* name; /*!< \brief Name of the device */
+    char* short_name; /*!< \brief Short name of the device */
+    uint64_t mem; /*!< \brief Total memory of device */
+    int ccapMajor; /*!< \brief Major number of device's compute capability */
+    int ccapMinor; /*!< \brief Minor number of device's compute capability */
+    int maxThreadsPerBlock; /*!< \brief Maximum number of threads per block */
+    int maxThreadsDim[3]; /*!< \brief Maximum sizes of each dimension of a block */
+    int maxGridSize[3]; /*!< \brief Maximum sizes of each dimension of a grid */
+    int sharedMemPerBlock; /*!< \brief Total amount of shared memory available per block */
+    int totalConstantMemory; /*!< \brief Total amount of constant memory available on the device */
+    int simdWidth; /*!< \brief SIMD width of arithmetic units = warp size */
+    int memPitch; /*!< \brief Maximum pitch allowed by the memory copy functions that involve memory regions allocated through cuMemAllocPitch() */
+    int regsPerBlock; /*!< \brief Total number of registers available per block */
+    int clockRatekHz; /*!< \brief Clock frequency in kilohertz */
+    int textureAlign; /*!< \brief Alignment requirement */
+    int surfaceAlign; /*!< \brief Alignment requirement for surfaces */
+    int l2Size; /*!< \brief L2 cache size in bytes. 0 if the device doesn't have L2 cache */
+    int memClockRatekHz; /*!< \brief Peak memory clock frequency in kilohertz */
+    int pciBus; /*!< \brief PCI bus identifier of the device */
+    int pciDev; /*!< \brief PCI device (also known as slot) identifier of the device */
+    int pciDom; /*!< \brief PCI domain identifier of the device */
+    int maxBlockRegs; /*!< \brief Maximum number of 32-bit registers available to a thread block */
+    int numMultiProcs; /*!< \brief Number of multiprocessors on the device */
+    int maxThreadPerMultiProc; /*!< \brief Maximum resident threads per multiprocessor */
+    int memBusWidth; /*!< \brief Global memory bus width in bits */
+    int unifiedAddrSpace; /*!< \brief 1 if the device shares a unified address space with the host, or 0 if not */
+    int ecc; /*!< \brief 1 if error correction is enabled on the device, 0 if error correction is disabled or not supported by the device */
+    int asyncEngines; /*!< \brief Number of asynchronous engines */
+    int mapHostMem; /*!< \brief 1 if the device can map host memory into the CUDA address space */
+    int integrated; /*!< \brief 1 if the device is an integrated (motherboard) GPU and 0 if it is a discrete (card) component */
+} GpuDevice;
+
+
+/*! \brief Structure holding information of all GPUs
+
+*/
+typedef struct {
+    int numDevices; /*!< \brief Number of detected devices */
+    GpuDevice* devices; /*!< \brief List with GPU-specific topology information */
+} GpuTopology;
+
+/*! \brief Variable holding the global gpu information structure */
+extern GpuTopology gpuTopology;
+/** \brief Pointer for exporting the GpuTopology data structure */
+typedef GpuTopology* GpuTopology_t;
+
+
+/*! \brief Initialize GPU topology information
+
+Reads in the topology information from the CUDA library (if found).
+\sa GpuTopology_t
+@return 0 or -errno in case of error
+*/
+extern int topology_gpu_init(void) __attribute__ ((visibility ("default") ));
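+
+/*
+ Illustrative sketch (added for this document, not part of the original
+ header): enumerate the detected GPUs. get_gpuTopology() and
+ topology_gpu_finalize() are declared just below; error handling is omitted.
+
+    if (topology_gpu_init() == 0) {
+        GpuTopology_t topo = get_gpuTopology();
+        for (int i = 0; i < topo->numDevices; i++)
+            printf("GPU %d: %s (compute capability %d.%d)\n",
+                   topo->devices[i].devid, topo->devices[i].name,
+                   topo->devices[i].ccapMajor, topo->devices[i].ccapMinor);
+        topology_gpu_finalize();
+    }
+*/
+/*!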
\brief Destroy GPU topology structure GpuTopology_t
+
+Retrieved pointers to the structures are not valid anymore after this function call
+\sa GpuTopology_t
+*/
+extern void topology_gpu_finalize(void) __attribute__ ((visibility ("default") ));
+/*! \brief Retrieve GPU topology of the current machine
+
+\sa GpuTopology_t
+@return GpuTopology_t (pointer to internal gpuTopology structure)
+*/
+extern GpuTopology_t get_gpuTopology(void) __attribute__ ((visibility ("default") ));
+
+
+/*
+################################################################################
+# NvMarker API related functions
+################################################################################
+*/
+/** \addtogroup NvMarkerAPI Marker API module for GPUs
+* @{
+*/
+/*! \brief Initialize LIKWID's NvMarker API
+
+Must be called in a serial region of the application to set up the basic data structures
+of LIKWID.
+Reads the environment variables:
+- LIKWID_GEVENTS (GPU event string)
+- LIKWID_GPUS (GPU list separated by ,)
+- LIKWID_GPUFILEPATH (Output path for the NvMarkerAPI file)
+*/
+extern void likwid_gpuMarkerInit(void) __attribute__ ((visibility ("default") ));
+/*! \brief Select next group to measure
+
+Must be called in a parallel region of the application to switch the group on every GPU.
+*/
+extern void likwid_gpuMarkerNextGroup(void) __attribute__ ((visibility ("default") ));
+/*! \brief Close LIKWID's NvMarker API
+
+Must be called in a serial region of the application. It gathers all data of regions and
+writes them out to a file (file path in the environment variable LIKWID_GPUFILEPATH).
+*/
+extern void likwid_gpuMarkerClose(void) __attribute__ ((visibility ("default") ));
+/*! \brief Register a measurement region
+
+Initializes the hashTable entry in order to reduce the execution time of likwid_gpuMarkerStartRegion()
+@param regionTag [in] Initialize data using this string
+@return Error code
+*/
+extern int likwid_gpuMarkerRegisterRegion(const char* regionTag) __attribute__ ((visibility ("default") ));
+/*! \brief Start a measurement region
+
+Reads the values of all configured counters and saves the results under the name given
+in regionTag.
+@param regionTag [in] Store data using this string
+@return Error code of start operation
+*/
+extern int likwid_gpuMarkerStartRegion(const char* regionTag) __attribute__ ((visibility ("default") ));
+/*! \brief Stop a measurement region
+
+Reads the values of all configured counters and saves the results under the name given
+in regionTag. The measurement data of the stopped region gets summed up in global region counters.
+@param regionTag [in] Store data using this string
+@return Error code of stop operation
+*/
+extern int likwid_gpuMarkerStopRegion(const char* regionTag) __attribute__ ((visibility ("default") ));
+/*! \brief Reset a measurement region
+
+Resets the values of all configured counters and timers.
+@param regionTag [in] Reset data using this string
+@return Error code of reset operation
+*/
+extern int likwid_gpuMarkerResetRegion(const char* regionTag) __attribute__ ((visibility ("default") ));
+/*! \brief Get accumulated data of a code region
+
+Get the accumulated data of the current thread for the given regionTag.
+@param regionTag [in] Print data using this string
+@param nr_gpus [in,out] Length of the first dimension of the arrays. Afterwards it holds the actual count of GPUs.
+@param nr_events [in,out] Length of the events array
+@param events [out] Events array for the intermediate results
+@param time [out] Accumulated measurement time
+@param count [out] Call count of the code region
+*/
+extern void likwid_gpuMarkerGetRegion(const char* regionTag, int* nr_gpus, int* nr_events, double** events, double **time, int **count) __attribute__ ((visibility ("default") ));
+
+/*! \brief Read the output file of the NvMarker API
+@param [in] filename Filename with NvMarker API results
+@return 0 or negative error number
+*/
+int nvmon_readMarkerFile(const char* filename) __attribute__ ((visibility ("default") ));
+/*! \brief Free the space of the read-in NvMarker API file
+*/
+void nvmon_destroyMarkerResults() __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of regions listed in the NvMarker API result file
+
+@return Number of regions
+*/
+int nvmon_getNumberOfRegions() __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of metrics of a region
+@param [in] region ID of region
+@return Number of metrics of region
+*/
+int nvmon_getMetricsOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of GPUs of a region
+@param [in] region ID of region
+@return Number of GPUs of region
+*/
+int nvmon_getGpusOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the GPU list of a region
+@param [in] region ID of region
+@param [in] count Length of gpulist array
+@param [in,out] gpulist gpulist array
+@return Number of GPUs of the region or count, whichever is lower
+*/
+int nvmon_getGpulistOfRegion(int region, int count, int* gpulist) __attribute__ ((visibility ("default") ));
+/*! \brief Get the accumulated measurement time of a region for a GPU
+@param [in] region ID of region
+@param [in] gpu ID of GPU
+@return Measurement time of a region for a GPU
+*/
+double nvmon_getTimeOfRegion(int region, int gpu) __attribute__ ((visibility ("default") ));
+/*! \brief Get the call count of a region for a GPU
+@param [in] region ID of region
+@param [in] gpu ID of GPU
+@return Call count of a region for a GPU
+*/
+int nvmon_getCountOfRegion(int region, int gpu) __attribute__ ((visibility ("default") ));
+/*! \brief Get the groupID of a region
+
+@param [in] region ID of region
+@return Group ID of region
+*/
+int nvmon_getGroupOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the tag of a region
+@param [in] region ID of region
+@return Tag of region
+*/
+char* nvmon_getTagOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of events of a region
+@param [in] region ID of region
+@return Number of events of region
+*/
+int nvmon_getEventsOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the event result of a region for an event and GPU
+@param [in] region ID of region
+@param [in] eventId ID of event
+@param [in] gpuId ID of GPU
+@return Result of a region for an event and GPU
+*/
+double nvmon_getResultOfRegionGpu(int region, int eventId, int gpuId) __attribute__ ((visibility ("default") ));
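+
+/*
+ Illustrative sketch (added for this document, not part of the original
+ header): post-process an NvMarkerAPI output file and print the measurement
+ time and call count of each region on the first GPU in its GPU list. The
+ file name is a hypothetical example; error handling is omitted.
+
+    if (nvmon_readMarkerFile("/tmp/nvmarker.out") == 0) {
+        for (int r = 0; r < nvmon_getNumberOfRegions(); r++) {
+            int gpu = 0;
+            nvmon_getGpulistOfRegion(r, 1, &gpu);   // first GPU of the region
+            printf("%s: %f s (%d calls)\n", nvmon_getTagOfRegion(r),
+                   nvmon_getTimeOfRegion(r, gpu), nvmon_getCountOfRegion(r, gpu));
+        }
+        nvmon_destroyMarkerResults();
+    }
+*/
+/*!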
\brief Get the metric result of a region for a metric and GPU
+@param [in] region ID of region
+@param [in] metricId ID of metric
+@param [in] gpuId ID of GPU
+@return Metric result of a region for a GPU
+*/
+double nvmon_getMetricOfRegionGpu(int region, int metricId, int gpuId) __attribute__ ((visibility ("default") ));
+
+/** @}*/
+
+/*
+################################################################################
+# Nvmon related functions (Nvidia GPU monitoring)
+################################################################################
+*/
+
+/** \addtogroup Nvmon Nvidia GPU monitoring API module for GPUs
+* @{
+*/
+
+/*! \brief Element in the output list from nvmon_getEventsOfGpu
+
+It holds the name, the description and the limitation string for one event.
+*/
+typedef struct {
+    char* name;  /*!< \brief Name of the event */
+    char* desc;  /*!< \brief Description of the event */
+    char* limit; /*!< \brief Limitation string of the event, commonly 'GPU' */
+} NvmonEventListEntry;
+
+/*! \brief Output list from nvmon_getEventsOfGpu with all supported events
+
+Output list from nvmon_getEventsOfGpu with all supported events
+*/
+typedef struct {
+    int numEvents;               /*!< \brief Number of events */
+    NvmonEventListEntry *events; /*!< \brief List of events */
+} NvmonEventList;
+/** \brief Pointer for exporting the NvmonEventList data structure */
+typedef NvmonEventList* NvmonEventList_t;
+
+
+/*! \brief Get the list of supported events of a GPU
+
+@param [in] gpuId ID of GPU (from GPU topology)
+@param [out] list List of events
+@return Number of supported events or -errno
+*/
+int nvmon_getEventsOfGpu(int gpuId, NvmonEventList_t* list);
+/*! \brief Return the list of supported events of a GPU
+
+Returns (frees) the list of supported events of a GPU obtained from nvmon_getEventsOfGpu()
+@param [in] list List of events
+*/
+void nvmon_returnEventsOfGpu(NvmonEventList_t list);
+
+
+/*! \brief Initialize the Nvidia GPU performance monitoring facility (Nvmon)
+
+Initialize the Nvidia GPU performance monitoring feature by creating basic data structures.
+The CUDA and CUPTI library paths need to be in LD_LIBRARY_PATH to be found by dlopen.
+
+@param [in] nrGpus Number of GPUs
+@param [in] gpuIds List of GPUs
+@return error code (0 on success, -ERRORCODE on failure)
+*/
+int nvmon_init(int nrGpus, const int* gpuIds) __attribute__ ((visibility ("default") ));
+
+/*! \brief Close the Nvidia GPU performance monitoring facility of LIKWID (Nvmon)
+
+Deallocates all internal data that is used during Nvmon performance monitoring. Also
+the counter values are not accessible anymore after calling this function.
+*/
+void nvmon_finalize(void) __attribute__ ((visibility ("default") ));
+/*! \brief Add an event string to LIKWID Nvmon
+
+An event string looks like Eventname:Countername,...
+The event name and counter name are checked for availability.
+
+@param [in] eventCString Event string
+@return Returns the ID of the new eventSet
+*/
+int nvmon_addEventSet(const char* eventCString) __attribute__ ((visibility ("default") ));
+/*! \brief Setup all Nvmon performance monitoring counters of an eventSet
+
+@param [in] gid Group ID (returned from nvmon_addEventSet())
+@return error code (-ENOENT if groupId is invalid and -1 if the counters of one GPU cannot be set up)
+*/
+int nvmon_setupCounters(int gid) __attribute__ ((visibility ("default") ));
+/*! \brief Start Nvmon performance monitoring counters
+
+Start the counters that have been previously set up by nvmon_setupCounters().
+The registered counters are zeroed before the counters are enabled
+@return 0 on success and -(gpuid+1) for error
+*/
+int nvmon_startCounters(void) __attribute__ ((visibility ("default") ));
+/*! \brief Stop Nvmon performance monitoring counters
+
+Stop the counters that have been previously started by nvmon_startCounters().
+@return 0 on success and -(gpuid+1) for error
+*/
+int nvmon_stopCounters(void) __attribute__ ((visibility ("default") ));
+/*! \brief Read the Nvmon performance monitoring counters on all GPUs
+
+Read the counters that have been previously started by nvmon_startCounters().
+@return 0 on success and -(gpuid+1) for error
+*/
+int nvmon_readCounters(void) __attribute__ ((visibility ("default") ));
+/*! \brief Switch the active eventSet to a new one (Nvmon)
+
+Stops the currently running counters, switches the eventSet by setting up the
+counters and starts the counters.
+@param [in] new_group ID of group that should be switched to.
+@return 0 on success and -(thread_id+1) for error
+*/
+int nvmon_switchActiveGroup(int new_group) __attribute__ ((visibility ("default") ));
+/*! \brief Set verbosity of LIKWID Nvmon library
+
+@param [in] level Verbosity level
+*/
+void nvmon_setVerbosity(int level) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the results of the specified group, counter and GPU (Nvmon)
+
+Get the result of all measurement cycles.
+@param [in] groupId ID of the group that should be read
+@param [in] eventId ID of the event that should be read
+@param [in] gpuId ID of the GPU that should be read
+@return The counter result
+*/
+double nvmon_getResult(int groupId, int eventId, int gpuId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the last results of the specified group, counter and GPU (Nvmon)
+
+Get the result of the last measurement cycle (between start/stop, start/read, read/read or read/stop).
+@param [in] groupId ID of the group that should be read
+@param [in] eventId ID of the event that should be read
+@param [in] gpuId ID of the GPU that should be read
+@return The counter result
+*/
+double nvmon_getLastResult(int groupId, int eventId, int gpuId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the metric result of the specified group, counter and GPU (Nvmon)
+
+Get the metric result of all measurement cycles. It reads all raw results for the given groupId and gpuId.
+@param [in] groupId ID of the group that should be read
+@param [in] metricId ID of the metric that should be calculated
+@param [in] gpuId ID of the GPU that should be read
+@return The metric result
+*/
+double nvmon_getMetric(int groupId, int metricId, int gpuId);
+/*! \brief Get the last metric result of the specified group, counter and GPU (Nvmon)
+
+Get the metric result of the last measurement cycle. It reads all raw results for the given groupId and gpuId.
+@param [in] groupId ID of the group that should be read
+@param [in] metricId ID of the metric that should be calculated
+@param [in] gpuId ID of the GPU that should be read
+@return The metric result
+*/
+double nvmon_getLastMetric(int groupId, int metricId, int gpuId);
+/*! \brief Get the number of configured event groups (Nvmon)
+
+@return Number of groups
+*/
+int nvmon_getNumberOfGroups(void) __attribute__ ((visibility ("default") ));
+/*! \brief Get the ID of the currently set up event group (Nvmon)
+
+@return Number of active group
+*/
+int nvmon_getIdOfActiveGroup(void) __attribute__ ((visibility ("default") ));
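+
+/*
+ Illustrative sketch (added for this document, not part of the original
+ header): a minimal Nvmon measurement cycle on one GPU. The event set string
+ and run_kernels() are hypothetical placeholders; real event and counter
+ names depend on the GPU. Error handling is omitted.
+
+    int gpu = 0;
+    nvmon_init(1, &gpu);
+    int gid = nvmon_addEventSet("INST_EXECUTED:GPU0");  // hypothetical event set
+    nvmon_setupCounters(gid);
+    nvmon_startCounters();
+    run_kernels();
+    nvmon_stopCounters();
+    printf("Event 0: %f\n", nvmon_getResult(gid, 0, 0)); // first event, first GPU
+    nvmon_finalize();
+*/
+/*!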
\brief Get the number of GPUs specified at nvmon_init() (Nvmon)
+
+@return Number of GPUs
+*/
+int nvmon_getNumberOfGPUs(void) __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of configured eventSets in group (Nvmon)
+
+@param [in] groupId ID of group
+@return Number of eventSets
+*/
+int nvmon_getNumberOfEvents(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of configured metrics for group (Nvmon)
+
+@param [in] groupId ID of group
+@return Number of metrics
+*/
+int nvmon_getNumberOfMetrics(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the accumulated measurement time of a group (Nvmon)
+
+@param [in] groupId ID of group
+@return Time in seconds the event group was measured
+*/
+double nvmon_getTimeOfGroup(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the last measurement time of a group (Nvmon)
+
+@param [in] groupId ID of group
+@return Time in seconds the event group was measured the last time
+*/
+double nvmon_getLastTimeOfGroup(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the event name of the specified group and event (Nvmon)
+
+Get the event name as defined in the performance group file
+@param [in] groupId ID of the group that should be read
+@param [in] eventId ID of the event that should be returned
+@return The event name or NULL in case of failure
+*/
+char* nvmon_getEventName(int groupId, int eventId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the counter name of the specified group and event (Nvmon)
+
+Get the counter name as defined in the performance group file
+@param [in] groupId ID of the group that should be read
+@param [in] eventId ID of the event of which the counter should be returned
+@return The counter name or NULL in case of failure
+*/
+char* nvmon_getCounterName(int groupId, int eventId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the metric name of the specified group and metric (Nvmon)
+
+Get the metric name as defined in the performance group file
+@param [in] groupId ID of the group that should be read
+@param [in] metricId ID of the metric that should be calculated
+@return The metric name or NULL in case of failure
+*/
+char* nvmon_getMetricName(int groupId, int metricId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the name of a group (Nvmon)
+
+Get the name of a group. Either it is the name of the performance group or "Custom"
+@param [in] groupId ID of the group that should be read
+@return The group name or NULL in case of failure
+*/
+char* nvmon_getGroupName(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the short informational string of the specified group (Nvmon)
+
+Returns the short information string as defined by performance groups or "Custom"
+in case of custom event sets
+@param [in] groupId ID of the group that should be read
+@return The short information or NULL in case of failure
+*/
+char* nvmon_getGroupInfoShort(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the long descriptive string of the specified group (Nvmon)
+
+Returns the long descriptive string as defined by performance groups or NULL
+in case of custom event sets
+@param [in] groupId ID of the group that should be read
+@return The long description or NULL in case of failure
+*/
+char* nvmon_getGroupInfoLong(int groupId) __attribute__ ((visibility ("default") ));
+
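+/*
+ Illustrative sketch (added for this document, not part of the original
+ header): list the performance groups shipped for GPU 0 together with their
+ short descriptions, using nvmon_getGroups()/nvmon_returnGroups() declared
+ just below.
+
+    char **groups, **shorts, **longs;
+    int n = nvmon_getGroups(0, &groups, &shorts, &longs);
+    for (int i = 0; i < n; i++)
+        printf("%s - %s\n", groups[i], shorts[i]);
+    nvmon_returnGroups(n, groups, shorts, longs);
+*/
+/*!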
\brief Get all groups (Nvmon)
+
+Checks the configured performance group path for the current GPU and
+returns all found group names
+@param [in] gpuId Get groups for a specific GPU
+@param [out] groups List of group names
+@param [out] shortinfos List of short information strings about the groups
+@param [out] longinfos List of long information strings about the groups
+@return Number of performance groups found
+*/
+int nvmon_getGroups(int gpuId, char*** groups, char*** shortinfos, char*** longinfos) __attribute__ ((visibility ("default") ));
+/*! \brief Free all group information (Nvmon)
+
+@param [in] nrgroups Number of groups
+@param [in] groups List of group names
+@param [in] shortinfos List of short information strings about the groups
+@param [in] longinfos List of long information strings about the groups
+*/
+int nvmon_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos) __attribute__ ((visibility ("default") ));
+
+
+
+/** @}*/
+
+#endif /* LIKWID_WITH_NVMON */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*LIKWID_H*/
diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go
new file mode 100644
index 0000000..2f12b29
--- /dev/null
+++ b/collectors/likwidMetric.go
@@ -0,0 +1,174 @@
+package collectors
+
+/*
+#cgo CFLAGS: -I./likwid
+#cgo LDFLAGS: -L./likwid -llikwid -llikwid-hwloc -lm
+#include <stdlib.h>
+#include <likwid.h>
+*/
+import "C"
+
+import (
+//  "io/ioutil"
+//  "log"
+//  "strconv"
+    "strings"
+    "fmt"
+    "time"
+    "unsafe"
+    "log"
+
+    //protocol "github.com/influxdata/line-protocol"
+)
+
+// LikwidCollector reads hardware performance metrics via LIKWID's perfmon API.
+type LikwidCollector struct {
+    MetricCollector
+    cpulist  []C.int
+    sock2tid map[int]int
+    metrics  map[C.int]map[string]int
+    groups   map[string]C.int
+    init     bool
+}
+
+type LikwidMetric struct {
+    name         string
+    search       string
+    socket_scope bool
+    group_idx    int
+}
+
+const GROUPPATH = `/home/unrz139/Work/cc-metric-collector/collectors/likwid/groups`
+
+var likwid_metrics = map[string][]LikwidMetric{
+    "MEM_DP": {LikwidMetric{name: "mem_bw", search: "Memory bandwidth [MBytes/s]", socket_scope: true},
+        LikwidMetric{name: "pwr1", search: "Power [W]", socket_scope: true},
+        LikwidMetric{name: "pwr2", search: "Power DRAM [W]", socket_scope: true},
+        LikwidMetric{name: "flops_dp", search: "DP [MFLOP/s]", socket_scope: false}},
+    "FLOPS_SP": {LikwidMetric{name: "clock", search: "Clock [MHz]", socket_scope: false},
+        LikwidMetric{name: "cpi", search: "CPI", socket_scope: false},
+        LikwidMetric{name: "flops_sp", search: "SP [MFLOP/s]", socket_scope: false}},
+}
+
+// getMetricId returns the index of the metric whose name contains the search string.
+func getMetricId(group C.int, search string) (int, error) {
+    for i := 0; i < int(C.perfmon_getNumberOfMetrics(group)); i++ {
+        mname := C.perfmon_getMetricName(group, C.int(i))
+        go_mname := C.GoString(mname)
+        if strings.Contains(go_mname, search) {
+            return i, nil
+        }
+    }
+    return -1, fmt.Errorf("Cannot find metric for search string '%s' in group %d", search, int(group))
+}
+
+// getSocketCpus maps the first CPU of each socket to its socket ID.
+func getSocketCpus() map[C.int]int {
+    slist := SocketList()
+    var cpu C.int
+    outmap := make(map[C.int]int)
+    for _, s := range slist {
+        t := C.CString(fmt.Sprintf("S%d", s))
+        clen := C.cpustr_to_cpulist(t, &cpu, 1)
+        C.free(unsafe.Pointer(t))
+        if int(clen) == 1 {
+            outmap[cpu] = s
+        }
+    }
+    return outmap
+}
+
+func (m *LikwidCollector) Init() {
+    m.name = "LikwidCollector"
+    m.setup()
+    cpulist := CpuList()
+    m.cpulist = make([]C.int, len(cpulist))
+    slist := getSocketCpus()
+
+    m.sock2tid = make(map[int]int)
+    for i, c := range cpulist {
+        m.cpulist[i] = C.int(c)
+        if sid, found := slist[m.cpulist[i]]; found {
+            m.sock2tid[sid] = i
+        }
+    }
+    m.metrics = make(map[C.int]map[string]int)
+    C.topology_init()
+    C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
+    gpath := C.CString(GROUPPATH)
+    C.config_setGroupPath(gpath)
+    C.free(unsafe.Pointer(gpath))
+    m.init = true
+    m.groups = make(map[string]C.int)
+    for g, metrics := range likwid_metrics {
+        cstr := C.CString(g)
+        gid := C.perfmon_addEventSet(cstr)
+        if gid >= 0 {
+            m.groups[g] = gid
+            for i, metric := range metrics {
+                idx, err := getMetricId(gid, metric.search)
+                if err != nil {
+                    log.Print(err)
+                } else {
+                    likwid_metrics[g][i].group_idx = idx
+                }
+            }
+        } else {
+            log.Print("Failed to add event set ", g)
+        }
+        C.free(unsafe.Pointer(cstr))
+    }
+}
+
+func (m *LikwidCollector) Read(interval time.Duration) {
+    if m.init {
+        for gname, gid := range m.groups {
+            C.perfmon_setupCounters(gid)
+            C.perfmon_startCounters()
+            time.Sleep(interval)
+            C.perfmon_stopCounters()
+
+            for _, lmetric := range likwid_metrics[gname] {
+                if lmetric.socket_scope {
+                    for sid, tid := range m.sock2tid {
+                        res := C.perfmon_getLastMetric(gid, C.int(lmetric.group_idx), C.int(tid))
+                        m.sockets[int(sid)][lmetric.name] = float64(res)
+                        // log.Print("Metric '", lmetric.name, "' on Socket ", int(sid), " returns ", m.sockets[int(sid)][lmetric.name])
+                    }
+                } else {
+                    for tid, cpu := range m.cpulist {
+                        res := C.perfmon_getLastMetric(gid, C.int(lmetric.group_idx), C.int(tid))
+                        m.cpus[int(cpu)][lmetric.name] = float64(res)
+                        // log.Print("Metric '", lmetric.name, "' on CPU ", int(cpu), " returns ", m.cpus[int(cpu)][lmetric.name])
+                    }
+                }
+            }
+            // Derive flops_any as the sum of the double- and single-precision rates.
+            for cpu := range m.cpus {
+                if flops_dp, found := m.cpus[cpu]["flops_dp"]; found {
+                    if flops_sp, found := m.cpus[cpu]["flops_sp"]; found {
+                        m.cpus[cpu]["flops_any"] = flops_dp.(float64) + flops_sp.(float64)
+                    }
+                }
+            }
+            // Combine the PKG (pwr1) and DRAM (pwr2) readings into a single 'power' metric.
+            for sid := range m.sockets {
+                if pwr1, found := m.sockets[int(sid)]["pwr1"]; found {
+                    if pwr2, found := m.sockets[int(sid)]["pwr2"]; found {
+                        sum := pwr1.(float64) + pwr2.(float64)
+                        if sum > 0 {
+                            m.sockets[int(sid)]["power"] = sum
+                        }
+                        delete(m.sockets[int(sid)], "pwr2")
+                    }
+                    delete(m.sockets[int(sid)], "pwr1")
+                }
+            }
+        }
+    }
+}
+
+func (m *LikwidCollector) Close() {
+    C.perfmon_finalize()
+    C.topology_finalize()
+    m.init = false
+}