mirror of
				https://github.com/ClusterCockpit/cc-metric-collector.git
				synced 2025-10-31 17:05:07 +01:00 
			
		
		
		
	Add likwid collector
This commit is contained in:
		
							
								
								
									
										301
									
								
								collectors/likwid/bstrlib.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										301
									
								
								collectors/likwid/bstrlib.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,301 @@ | ||||
| /* | ||||
|  * ======================================================================================= | ||||
|  * This source file is part of the bstring string library.  This code was | ||||
|  * written by Paul Hsieh in 2002-2008, and is covered by the BSD open source | ||||
|  * license and the GPL. Refer to the accompanying documentation for details | ||||
|  * on usage and license. | ||||
|  */ | ||||
| /* | ||||
|  * bstrlib.h | ||||
|  * | ||||
|  * This file is the header for the core module implementing the bstring | ||||
|  * functions. | ||||
|  */ | ||||
|  | ||||
| #ifndef BSTRLIB_INCLUDE | ||||
| #define BSTRLIB_INCLUDE | ||||
|  | ||||
| #ifdef __cplusplus | ||||
| extern "C" { | ||||
| #endif | ||||
|  | ||||
| #include <stdarg.h> | ||||
| #include <string.h> | ||||
| #include <limits.h> | ||||
| #include <ctype.h> | ||||
|  | ||||
| #if !defined (BSTRLIB_VSNP_OK) && !defined (BSTRLIB_NOVSNP) | ||||
| # if defined (__TURBOC__) && !defined (__BORLANDC__) | ||||
| #  define BSTRLIB_NOVSNP | ||||
| # endif | ||||
| #endif | ||||
|  | ||||
| #define BSTR_ERR (-1) | ||||
| #define BSTR_OK (0) | ||||
| #define BSTR_BS_BUFF_LENGTH_GET (0) | ||||
|  | ||||
| typedef struct tagbstring * bstring; | ||||
| typedef const struct tagbstring * const_bstring; | ||||
|  | ||||
| /* Copy functions */ | ||||
| #define cstr2bstr bfromcstr | ||||
| extern bstring bfromcstr (const char * str); | ||||
| extern bstring bfromcstralloc (int mlen, const char * str); | ||||
| extern bstring blk2bstr (const void * blk, int len); | ||||
| extern char * bstr2cstr (const_bstring s, char z); | ||||
| extern int bcstrfree (char * s); | ||||
| extern bstring bstrcpy (const_bstring b1); | ||||
| extern int bassign (bstring a, const_bstring b); | ||||
| extern int bassignmidstr (bstring a, const_bstring b, int left, int len); | ||||
| extern int bassigncstr (bstring a, const char * str); | ||||
| extern int bassignblk (bstring a, const void * s, int len); | ||||
|  | ||||
| /* Destroy function */ | ||||
| extern int bdestroy (bstring b); | ||||
|  | ||||
| /* Space allocation hinting functions */ | ||||
| extern int balloc (bstring s, int len); | ||||
| extern int ballocmin (bstring b, int len); | ||||
|  | ||||
| /* Substring extraction */ | ||||
| extern bstring bmidstr (const_bstring b, int left, int len); | ||||
|  | ||||
| /* Various standard manipulations */ | ||||
| extern int bconcat (bstring b0, const_bstring b1); | ||||
| extern int bconchar (bstring b0, char c); | ||||
| extern int bcatcstr (bstring b, const char * s); | ||||
| extern int bcatblk (bstring b, const void * s, int len); | ||||
| extern int binsert (bstring s1, int pos, const_bstring s2, unsigned char fill); | ||||
| extern int binsertch (bstring s1, int pos, int len, unsigned char fill); | ||||
| extern int breplace (bstring b1, int pos, int len, const_bstring b2, unsigned char fill); | ||||
| extern int bdelete (bstring s1, int pos, int len); | ||||
| extern int bsetstr (bstring b0, int pos, const_bstring b1, unsigned char fill); | ||||
| extern int btrunc (bstring b, int n); | ||||
|  | ||||
| /* Scan/search functions */ | ||||
| extern int bstricmp (const_bstring b0, const_bstring b1); | ||||
| extern int bstrnicmp (const_bstring b0, const_bstring b1, int n); | ||||
| extern int biseqcaseless (const_bstring b0, const_bstring b1); | ||||
| extern int bisstemeqcaselessblk (const_bstring b0, const void * blk, int len); | ||||
| extern int biseq (const_bstring b0, const_bstring b1); | ||||
| extern int bisstemeqblk (const_bstring b0, const void * blk, int len); | ||||
| extern int biseqcstr (const_bstring b, const char * s); | ||||
| extern int biseqcstrcaseless (const_bstring b, const char * s); | ||||
| extern int bstrcmp (const_bstring b0, const_bstring b1); | ||||
| extern int bstrncmp (const_bstring b0, const_bstring b1, int n); | ||||
| extern int binstr (const_bstring s1, int pos, const_bstring s2); | ||||
| extern int binstrr (const_bstring s1, int pos, const_bstring s2); | ||||
| extern int binstrcaseless (const_bstring s1, int pos, const_bstring s2); | ||||
| extern int binstrrcaseless (const_bstring s1, int pos, const_bstring s2); | ||||
| extern int bstrchrp (const_bstring b, int c, int pos); | ||||
| extern int bstrrchrp (const_bstring b, int c, int pos); | ||||
| #define bstrchr(b,c) bstrchrp ((b), (c), 0) | ||||
| #define bstrrchr(b,c) bstrrchrp ((b), (c), blength(b)-1) | ||||
| extern int binchr (const_bstring b0, int pos, const_bstring b1); | ||||
| extern int binchrr (const_bstring b0, int pos, const_bstring b1); | ||||
| extern int bninchr (const_bstring b0, int pos, const_bstring b1); | ||||
| extern int bninchrr (const_bstring b0, int pos, const_bstring b1); | ||||
| extern int bfindreplace (bstring b, const_bstring find, const_bstring repl, int pos); | ||||
| extern int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl, int pos); | ||||
|  | ||||
| /* List of string container functions | ||||
|  * | ||||
|  * struct bstrList is the container taken by the bstrList* functions and | ||||
|  * bjoin, and returned by the bsplit* functions declared in this header. | ||||
|  * NOTE(review): presumably qty is the number of entries in use and mlen the | ||||
|  * number of slots allocated in entry -- confirm against bstrlib.c. | ||||
|  */ | ||||
| struct bstrList { | ||||
|     int qty, mlen; | ||||
|     bstring * entry; | ||||
| }; | ||||
| extern struct bstrList * bstrListCreate (void); | ||||
| extern int bstrListDestroy (struct bstrList * sl); | ||||
| extern int bstrListAlloc (struct bstrList * sl, int msz); | ||||
| extern int bstrListAllocMin (struct bstrList * sl, int msz); | ||||
|  | ||||
| /* String split and join functions */ | ||||
| extern struct bstrList * bsplit (const_bstring str, unsigned char splitChar); | ||||
| extern struct bstrList * bsplits (const_bstring str, const_bstring splitStr); | ||||
| extern struct bstrList * bsplitstr (const_bstring str, const_bstring splitStr); | ||||
| extern bstring bjoin (const struct bstrList * bl, const_bstring sep); | ||||
| extern int bsplitcb (const_bstring str, unsigned char splitChar, int pos, | ||||
|     int (* cb) (void * parm, int ofs, int len), void * parm); | ||||
| extern int bsplitscb (const_bstring str, const_bstring splitStr, int pos, | ||||
|     int (* cb) (void * parm, int ofs, int len), void * parm); | ||||
| extern int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos, | ||||
|     int (* cb) (void * parm, int ofs, int len), void * parm); | ||||
|  | ||||
| /* Miscellaneous functions */ | ||||
| extern int bpattern (bstring b, int len); | ||||
| extern int btoupper (bstring b); | ||||
| extern int btolower (bstring b); | ||||
| extern int bltrimws (bstring b); | ||||
| extern int brtrimws (bstring b); | ||||
| extern int btrimws (bstring b); | ||||
|  | ||||
| #if !defined (BSTRLIB_NOVSNP) | ||||
| extern bstring bformat (const char * fmt, ...); | ||||
| extern int bformata (bstring b, const char * fmt, ...); | ||||
| extern int bassignformat (bstring b, const char * fmt, ...); | ||||
| extern int bvcformata (bstring b, int count, const char * fmt, va_list arglist); | ||||
| /* | ||||
|  * bvformata (ret, b, fmt, lastarg) | ||||
|  * | ||||
|  * Statement macro that calls bvcformata (b, size, fmt, arglist) in a loop, | ||||
|  * starting with size 16 and restarting the va_list from lastarg on every | ||||
|  * iteration. It uses va_start, so it can only be expanded inside a variadic | ||||
|  * function; lastarg is that function's last named parameter. | ||||
|  * | ||||
|  *  - bvcformata result >= 0: ret is set to BSTR_OK. | ||||
|  *  - negative result whose magnitude is <= the size just tried: ret is set | ||||
|  *    to BSTR_ERR. | ||||
|  *  - any other negative result: its magnitude becomes the next size and the | ||||
|  *    call is retried. | ||||
|  * | ||||
|  * b and fmt are evaluated once; ret must be an assignable int expression. | ||||
|  * The expansion is a braced block, not a do { } while (0) statement. | ||||
|  */ | ||||
| #define bvformata(ret, b, fmt, lastarg) { \ | ||||
| bstring bstrtmp_b = (b); \ | ||||
| const char * bstrtmp_fmt = (fmt); \ | ||||
| int bstrtmp_r = BSTR_ERR, bstrtmp_sz = 16; \ | ||||
|     for (;;) { \ | ||||
|         va_list bstrtmp_arglist; \ | ||||
|         va_start (bstrtmp_arglist, lastarg); \ | ||||
|         bstrtmp_r = bvcformata (bstrtmp_b, bstrtmp_sz, bstrtmp_fmt, bstrtmp_arglist); \ | ||||
|         va_end (bstrtmp_arglist); \ | ||||
|         if (bstrtmp_r >= 0) { /* Everything went ok */ \ | ||||
|             bstrtmp_r = BSTR_OK; \ | ||||
|             break; \ | ||||
|         } else if (-bstrtmp_r <= bstrtmp_sz) { /* A real error? */ \ | ||||
|             bstrtmp_r = BSTR_ERR; \ | ||||
|             break; \ | ||||
|         } \ | ||||
|         bstrtmp_sz = -bstrtmp_r; /* Doubled or target size */ \ | ||||
|     } \ | ||||
|     ret = bstrtmp_r; \ | ||||
| } | ||||
|  | ||||
| #endif | ||||
|  | ||||
| typedef int (*bNgetc) (void *parm); | ||||
| typedef size_t (* bNread) (void *buff, size_t elsize, size_t nelem, void *parm); | ||||
|  | ||||
| /* Input functions */ | ||||
| extern bstring bgets (bNgetc getcPtr, void * parm, char terminator); | ||||
| extern bstring bread (bNread readPtr, void * parm); | ||||
| extern int bgetsa (bstring b, bNgetc getcPtr, void * parm, char terminator); | ||||
| extern int bassigngets (bstring b, bNgetc getcPtr, void * parm, char terminator); | ||||
| extern int breada (bstring b, bNread readPtr, void * parm); | ||||
|  | ||||
| /* Stream functions */ | ||||
| extern struct bStream * bsopen (bNread readPtr, void * parm); | ||||
| extern void * bsclose (struct bStream * s); | ||||
| extern int bsbufflength (struct bStream * s, int sz); | ||||
| extern int bsreadln (bstring b, struct bStream * s, char terminator); | ||||
| extern int bsreadlns (bstring r, struct bStream * s, const_bstring term); | ||||
| extern int bsread (bstring b, struct bStream * s, int n); | ||||
| extern int bsreadlna (bstring b, struct bStream * s, char terminator); | ||||
| extern int bsreadlnsa (bstring r, struct bStream * s, const_bstring term); | ||||
| extern int bsreada (bstring b, struct bStream * s, int n); | ||||
| extern int bsunread (struct bStream * s, const_bstring b); | ||||
| extern int bspeek (bstring r, const struct bStream * s); | ||||
| extern int bssplitscb (struct bStream * s, const_bstring splitStr, | ||||
|     int (* cb) (void * parm, int ofs, const_bstring entry), void * parm); | ||||
| extern int bssplitstrcb (struct bStream * s, const_bstring splitStr, | ||||
|     int (* cb) (void * parm, int ofs, const_bstring entry), void * parm); | ||||
| extern int bseof (const struct bStream * s); | ||||
| /* | ||||
|  * struct tagbstring: the object behind the bstring / const_bstring handles. | ||||
|  * | ||||
|  *  mlen - values <= 0 are reported as write protected by | ||||
|  *         biswriteprotected(); NOTE(review): presumably the allocated size | ||||
|  *         of data otherwise -- confirm against bstrlib.c. | ||||
|  *  slen - length returned by blength(); the accessor macros treat a | ||||
|  *         negative slen as an invalid string. | ||||
|  *  data - byte buffer; the accessor macros treat a null data as invalid. | ||||
|  */ | ||||
| struct tagbstring { | ||||
|     int mlen; | ||||
|     int slen; | ||||
|     unsigned char * data; | ||||
| }; | ||||
|  | ||||
| /* Accessor macros | ||||
|  * | ||||
|  *  blengthe (b, e)     - b->slen, or (int) e when b is null or slen < 0. | ||||
|  *  blength (b)         - blengthe with e = 0. | ||||
|  *  bdataofse (b, o, e) - (char *) b->data + o, or (char *) e when b or | ||||
|  *                        b->data is null. The offset o is not range checked. | ||||
|  *  bdataofs (b, o)     - bdataofse with a null fallback. | ||||
|  *  bdatae (b, e)       - bdataofse at offset 0. | ||||
|  *  bdata (b)           - bdataofs at offset 0. | ||||
|  *  bchare (b, p, e)    - b->data[p] when 0 <= p < blength (b), otherwise e | ||||
|  *                        (the unsigned compare also rejects negative p). | ||||
|  *  bchar (b, p)        - bchare with e = '\0'. | ||||
|  * | ||||
|  * These are expression macros; their arguments may be evaluated more than | ||||
|  * once, so do not pass expressions with side effects. | ||||
|  */ | ||||
| #define blengthe(b, e)      (((b) == (void *)0 || (b)->slen < 0) ? (int)(e) : ((b)->slen)) | ||||
| #define blength(b)          (blengthe ((b), 0)) | ||||
| #define bdataofse(b, o, e)  (((b) == (void *)0 || (b)->data == (void*)0) ? (char *)(e) : ((char *)(b)->data) + (o)) | ||||
| #define bdataofs(b, o)      (bdataofse ((b), (o), (void *)0)) | ||||
| #define bdatae(b, e)        (bdataofse (b, 0, e)) | ||||
| #define bdata(b)            (bdataofs (b, 0)) | ||||
| #define bchare(b, p, e)     ((((unsigned)(p)) < (unsigned)blength(b)) ? ((b)->data[(p)]) : (e)) | ||||
| #define bchar(b, p)         bchare ((b), (p), '\0') | ||||
|  | ||||
| /* Static constant string initialization macro | ||||
|  * | ||||
|  * bsStaticMlen (q, m) expands to a brace initializer for struct tagbstring | ||||
|  * in field order {mlen, slen, data}: mlen = m, slen = sizeof (q) - 1 and | ||||
|  * data = q. The "" q "" concatenation only compiles when q is a string | ||||
|  * literal. | ||||
|  * bsStatic (q) uses a negative mlen (-32 under _MSC_VER, -__LINE__ | ||||
|  * elsewhere), so biswriteprotected() reports the result as write protected. | ||||
|  */ | ||||
| #define bsStaticMlen(q,m)   {(m), (int) sizeof(q)-1, (unsigned char *) ("" q "")} | ||||
| #if defined(_MSC_VER) | ||||
| # define bsStatic(q)        bsStaticMlen(q,-32) | ||||
| #endif | ||||
| #ifndef bsStatic | ||||
| # define bsStatic(q)        bsStaticMlen(q,-__LINE__) | ||||
| #endif | ||||
|  | ||||
| /* Static constant block parameter pair */ | ||||
| #define bsStaticBlkParms(q) ((void *)("" q "")), ((int) sizeof(q)-1) | ||||
|  | ||||
| /* Reference building macros | ||||
|  * | ||||
|  * btfromcstr (t, s): make the struct tagbstring t (an object, not a | ||||
|  * pointer) refer to the C string s without copying it. slen is strlen (s), | ||||
|  * or 0 when s is null; mlen is set to -1, which biswriteprotected() reports | ||||
|  * as write protected. strlen is written as (strlen) so that a function-like | ||||
|  * macro of that name is not expanded. | ||||
|  * The expansion is a braced block and t is evaluated several times. | ||||
|  * cstr2tbstr is an alias for btfromcstr. | ||||
|  */ | ||||
| #define cstr2tbstr btfromcstr | ||||
| #define btfromcstr(t,s) {                                            \ | ||||
|     (t).data = (unsigned char *) (s);                                \ | ||||
|     (t).slen = ((t).data) ? ((int) (strlen) ((char *)(t).data)) : 0; \ | ||||
|     (t).mlen = -1;                                                   \ | ||||
| } | ||||
| #define blk2tbstr(t,s,l) { /* make t refer to block s of length l (no copy); mlen = -1 */ \ | ||||
|     (t).data = (unsigned char *) (s); \ | ||||
|     (t).slen = l;                     /* NOTE(review): l is expanded without parentheses */ \ | ||||
|     (t).mlen = -1;                    \ | ||||
| } | ||||
| #define btfromblk(t,s,l) blk2tbstr(t,s,l) /* alias for blk2tbstr */ | ||||
| #define bmid2tbstr(t,b,p,l) { /* t = view of b starting at p, at most l bytes, clamped to b (no copy) */ \ | ||||
|     const_bstring bstrtmp_s = (b);                                           \ | ||||
|     if (bstrtmp_s && bstrtmp_s->data && bstrtmp_s->slen >= 0) {              \ | ||||
|         int bstrtmp_left = (p);                                              \ | ||||
|         int bstrtmp_len  = (l);                                              \ | ||||
|         if (bstrtmp_left < 0) { /* negative start: shorten len, start at 0 */ \ | ||||
|             bstrtmp_len += bstrtmp_left;                                     \ | ||||
|             bstrtmp_left = 0;                                                \ | ||||
|         }                                                                    \ | ||||
|         if (bstrtmp_len > bstrtmp_s->slen - bstrtmp_left) /* clamp to end of b */ \ | ||||
|             bstrtmp_len = bstrtmp_s->slen - bstrtmp_left;                    \ | ||||
|         if (bstrtmp_len <= 0) { /* nothing left: empty string */             \ | ||||
|             (t).data = (unsigned char *)"";                                  \ | ||||
|             (t).slen = 0;                                                    \ | ||||
|         } else {                                                             \ | ||||
|             (t).data = bstrtmp_s->data + bstrtmp_left;                       \ | ||||
|             (t).slen = bstrtmp_len;                                          \ | ||||
|         }                                                                    \ | ||||
|     } else { /* null or invalid b: empty string */                           \ | ||||
|         (t).data = (unsigned char *)"";                                      \ | ||||
|         (t).slen = 0;                                                        \ | ||||
|     }                                                                        \ | ||||
|     (t).mlen = -__LINE__; /* negative mlen: biswriteprotected(t) is true */  \ | ||||
| } | ||||
| #define btfromblkltrimws(t,s,l) { /* t = view of block s (l bytes) without leading isspace() bytes */ \ | ||||
|     int bstrtmp_idx = 0, bstrtmp_len = (l);                                  \ | ||||
|     unsigned char * bstrtmp_s = (s);                                         \ | ||||
|     if (bstrtmp_s && bstrtmp_len >= 0) {                                     \ | ||||
|         for (; bstrtmp_idx < bstrtmp_len; bstrtmp_idx++) {                   \ | ||||
|             if (!isspace (bstrtmp_s[bstrtmp_idx])) break;                    \ | ||||
|         }                                                                    \ | ||||
|     }                                                                        \ | ||||
|     (t).data = bstrtmp_s + bstrtmp_idx; /* also runs for null s or l < 0 */  \ | ||||
|     (t).slen = bstrtmp_len - bstrtmp_idx;                                    \ | ||||
|     (t).mlen = -__LINE__;                                                    \ | ||||
| } | ||||
| #define btfromblkrtrimws(t,s,l) { /* t = view of block s (l bytes) without trailing isspace() bytes */ \ | ||||
|     int bstrtmp_len = (l) - 1; /* index of the last byte, not a length */    \ | ||||
|     unsigned char * bstrtmp_s = (s);                                         \ | ||||
|     if (bstrtmp_s && bstrtmp_len >= 0) {                                     \ | ||||
|         for (; bstrtmp_len >= 0; bstrtmp_len--) {                            \ | ||||
|             if (!isspace (bstrtmp_s[bstrtmp_len])) break;                    \ | ||||
|         }                                                                    \ | ||||
|     }                                                                        \ | ||||
|     (t).data = bstrtmp_s;                                                    \ | ||||
|     (t).slen = bstrtmp_len + 1; /* last kept index + 1 */                    \ | ||||
|     (t).mlen = -__LINE__;                                                    \ | ||||
| } | ||||
| #define btfromblktrimws(t,s,l) { /* t = view of block s (l bytes) without leading and trailing isspace() bytes */ \ | ||||
|     int bstrtmp_idx = 0, bstrtmp_len = (l) - 1; /* first and last index */   \ | ||||
|     unsigned char * bstrtmp_s = (s);                                         \ | ||||
|     if (bstrtmp_s && bstrtmp_len >= 0) {                                     \ | ||||
|         for (; bstrtmp_idx <= bstrtmp_len; bstrtmp_idx++) {                  \ | ||||
|             if (!isspace (bstrtmp_s[bstrtmp_idx])) break;                    \ | ||||
|         }                                                                    \ | ||||
|         for (; bstrtmp_len >= bstrtmp_idx; bstrtmp_len--) {                  \ | ||||
|             if (!isspace (bstrtmp_s[bstrtmp_len])) break;                    \ | ||||
|         }                                                                    \ | ||||
|     }                                                                        \ | ||||
|     (t).data = bstrtmp_s + bstrtmp_idx;                                      \ | ||||
|     (t).slen = bstrtmp_len + 1 - bstrtmp_idx; /* 0 when all whitespace */    \ | ||||
|     (t).mlen = -__LINE__;                                                    \ | ||||
| } | ||||
|  | ||||
| /* Write protection macros | ||||
|  * | ||||
|  * These take a struct tagbstring object t (not a bstring pointer). | ||||
|  *  bwriteprotect (t)     - if mlen >= 0, overwrite it with -1. The previous | ||||
|  *                          mlen value is not saved. | ||||
|  *  bwriteallow (t)       - only if mlen is exactly -1, set mlen to slen, or | ||||
|  *                          to 1 when slen is 0. Other negative mlen values | ||||
|  *                          (e.g. those set by bsStatic) are left untouched. | ||||
|  *  biswriteprotected (t) - non-zero when mlen <= 0. | ||||
|  */ | ||||
| #define bwriteprotect(t)     { if ((t).mlen >=  0) (t).mlen = -1; } | ||||
| #define bwriteallow(t)       { if ((t).mlen == -1) (t).mlen = (t).slen + ((t).slen == 0); } | ||||
| #define biswriteprotected(t) ((t).mlen <= 0) | ||||
|  | ||||
| #ifdef __cplusplus | ||||
| } | ||||
| #endif | ||||
|  | ||||
| #endif | ||||
							
								
								
									
										31
									
								
								collectors/likwid/groups/CLX/BRANCH.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										31
									
								
								collectors/likwid/groups/CLX/BRANCH.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,31 @@ | ||||
| SHORT Branch prediction miss rate/ratio | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  BR_INST_RETIRED_ALL_BRANCHES | ||||
| PMC1  BR_MISP_RETIRED_ALL_BRANCHES | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Branch rate   PMC0/FIXC0 | ||||
| Branch misprediction rate  PMC1/FIXC0 | ||||
| Branch misprediction ratio  PMC1/PMC0 | ||||
| Instructions per branch  FIXC0/PMC0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY | ||||
| Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY | ||||
| Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES | ||||
| Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES | ||||
| - | ||||
| The rates state how often on average a branch or a mispredicted branch occurred | ||||
| per instruction retired in total. The branch misprediction ratio directly | ||||
| states what fraction of all branch instructions were mispredicted. | ||||
| Instructions per branch is 1/branch rate. | ||||
|  | ||||
							
								
								
									
										143
									
								
								collectors/likwid/groups/CLX/CACHES.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										143
									
								
								collectors/likwid/groups/CLX/CACHES.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,143 @@ | ||||
| SHORT Cache bandwidth in MBytes/s | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  L1D_REPLACEMENT | ||||
| PMC1  L1D_M_EVICT | ||||
| PMC2  L2_LINES_IN_ALL | ||||
| PMC3  L2_TRANS_L2_WB | ||||
| CBOX0C1 LLC_VICTIMS_M_STATE | ||||
| CBOX1C1 LLC_VICTIMS_M_STATE | ||||
| CBOX2C1 LLC_VICTIMS_M_STATE | ||||
| CBOX3C1 LLC_VICTIMS_M_STATE | ||||
| CBOX4C1 LLC_VICTIMS_M_STATE | ||||
| CBOX5C1 LLC_VICTIMS_M_STATE | ||||
| CBOX6C1 LLC_VICTIMS_M_STATE | ||||
| CBOX7C1 LLC_VICTIMS_M_STATE | ||||
| CBOX8C1 LLC_VICTIMS_M_STATE | ||||
| CBOX9C1 LLC_VICTIMS_M_STATE | ||||
| CBOX10C1 LLC_VICTIMS_M_STATE | ||||
| CBOX11C1 LLC_VICTIMS_M_STATE | ||||
| CBOX12C1 LLC_VICTIMS_M_STATE | ||||
| CBOX13C1 LLC_VICTIMS_M_STATE | ||||
| CBOX14C1 LLC_VICTIMS_M_STATE | ||||
| CBOX15C1 LLC_VICTIMS_M_STATE | ||||
| CBOX16C1 LLC_VICTIMS_M_STATE | ||||
| CBOX17C1 LLC_VICTIMS_M_STATE | ||||
| CBOX18C1 LLC_VICTIMS_M_STATE | ||||
| CBOX19C1 LLC_VICTIMS_M_STATE | ||||
| CBOX20C1 LLC_VICTIMS_M_STATE | ||||
| CBOX21C1 LLC_VICTIMS_M_STATE | ||||
| CBOX22C1 LLC_VICTIMS_M_STATE | ||||
| CBOX23C1 LLC_VICTIMS_M_STATE | ||||
| CBOX24C1 LLC_VICTIMS_M_STATE | ||||
| CBOX25C1 LLC_VICTIMS_M_STATE | ||||
| CBOX26C1 LLC_VICTIMS_M_STATE | ||||
| CBOX27C1 LLC_VICTIMS_M_STATE | ||||
| CBOX0C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX1C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX2C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX3C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX4C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX5C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX6C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX7C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX8C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX9C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX10C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX11C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX12C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX13C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX14C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX15C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX16C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX17C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX18C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX19C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX20C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX21C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX22C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX23C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX24C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX25C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX26C0 LLC_LOOKUP_DATA_READ | ||||
| CBOX27C0 LLC_LOOKUP_DATA_READ | ||||
| MBOX0C0 CAS_COUNT_RD | ||||
| MBOX0C1 CAS_COUNT_WR | ||||
| MBOX1C0 CAS_COUNT_RD | ||||
| MBOX1C1 CAS_COUNT_WR | ||||
| MBOX2C0 CAS_COUNT_RD | ||||
| MBOX2C1 CAS_COUNT_WR | ||||
| MBOX3C0 CAS_COUNT_RD | ||||
| MBOX3C1 CAS_COUNT_WR | ||||
| MBOX4C0 CAS_COUNT_RD | ||||
| MBOX4C1 CAS_COUNT_WR | ||||
| MBOX5C0 CAS_COUNT_RD | ||||
| MBOX5C1 CAS_COUNT_WR | ||||
|  | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time | ||||
| L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0 | ||||
| L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time | ||||
| L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0 | ||||
| L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time | ||||
| L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 | ||||
| L3 to L2 load bandwidth [MBytes/s]  1.0E-06*PMC2*64.0/time | ||||
| L3 to L2 load data volume [GBytes]  1.0E-09*PMC2*64.0 | ||||
| L2 to L3 evict bandwidth [MBytes/s]  1.0E-06*PMC3*64.0/time | ||||
| L2 to L3 evict data volume [GBytes]  1.0E-09*PMC3*64.0 | ||||
| L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time | ||||
| L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 | ||||
| System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0)*64.0/time | ||||
| System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0)*64.0 | ||||
| L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64/time | ||||
| L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64 | ||||
| L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64.0/time | ||||
| L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64.0 | ||||
| Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time | ||||
| Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 | ||||
| Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time | ||||
| Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 | ||||
| Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time | ||||
| Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time | ||||
| L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64 | ||||
| L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time | ||||
| L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64 | ||||
| L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time | ||||
| L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64 | ||||
| L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time | ||||
| L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64 | ||||
| L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time | ||||
| L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64 | ||||
| L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time | ||||
| L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 | ||||
| System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ))*64/time | ||||
| System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ))*64 | ||||
| L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M_STATE))*64/time | ||||
| L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M_STATE))*64 | ||||
| L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M_STATE))*64/time | ||||
| L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M_STATE))*64 | ||||
| Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time | ||||
| Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0 | ||||
| Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time | ||||
| Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 | ||||
| Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time | ||||
| Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 | ||||
| - | ||||
| Group to measure cache transfers between L1 and memory. Please note that the | ||||
| L3 to/from system metrics should contain all traffic to the system (memory, | ||||
| Intel QPI, etc.) but do not seem to capture all of it, because commonly the | ||||
| memory read bandwidth and the L3 to L2 bandwidth are higher than the memory | ||||
| to L3 bandwidth. | ||||
|  | ||||
							
								
								
									
										26
									
								
								collectors/likwid/groups/CLX/CLOCK.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								collectors/likwid/groups/CLX/CLOCK.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,26 @@ | ||||
| SHORT Power and Energy consumption | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PWR0  PWR_PKG_ENERGY | ||||
| UBOXFIX UNCORE_CLOCK | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| Uncore Clock [MHz] 1.E-06*UBOXFIX/time | ||||
| CPI  FIXC1/FIXC0 | ||||
| Energy [J]  PWR0 | ||||
| Power [W] PWR0/time | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Power =  PWR_PKG_ENERGY / time | ||||
| Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time | ||||
| - | ||||
| Broadwell implements the new RAPL interface. This interface makes it possible | ||||
| to monitor the consumed energy at the package (socket) level. | ||||
|  | ||||
							
								
								
									
										38
									
								
								collectors/likwid/groups/CLX/CYCLE_ACTIVITY.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										38
									
								
								collectors/likwid/groups/CLX/CYCLE_ACTIVITY.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,38 @@ | ||||
| SHORT Cycle Activities | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING | ||||
| PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING | ||||
| PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING | ||||
| PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Cycles without execution [%] (PMC3/FIXC1)*100 | ||||
| Cycles without execution due to L1D [%] (PMC2/FIXC1)*100 | ||||
| Cycles without execution due to L2 [%] (PMC0/FIXC1)*100 | ||||
| Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100 | ||||
| Cycles without execution due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100 | ||||
| Cycles without execution due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100 | ||||
| Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100 | ||||
| -- | ||||
| This performance group measures the cycles while waiting for data from the cache | ||||
| and memory hierarchy. | ||||
| CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts number of cycles nothing is executed on | ||||
| any execution port. | ||||
| CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is | ||||
| outstanding. | ||||
| CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is | ||||
| outstanding. | ||||
| CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while memory subsystem has an | ||||
| outstanding load. | ||||
							
								
								
									
										45
									
								
								collectors/likwid/groups/CLX/CYCLE_STALLS.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										45
									
								
								collectors/likwid/groups/CLX/CYCLE_STALLS.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,45 @@ | ||||
| SHORT Cycle Activities (Stalls) | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING | ||||
| PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING | ||||
| PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING | ||||
| PMC3 CYCLE_ACTIVITY_STALLS_TOTAL | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Total execution stalls PMC3 | ||||
| Stalls caused by L1D misses [%] (PMC2/PMC3)*100 | ||||
| Stalls caused by L2 misses [%] (PMC0/PMC3)*100 | ||||
| Stalls caused by memory loads [%] (PMC1/PMC3)*100 | ||||
| Execution stall rate [%] (PMC3/FIXC1)*100 | ||||
| Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100 | ||||
| Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100 | ||||
| Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL | ||||
| Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 | ||||
| Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 | ||||
| Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 | ||||
| Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100 | ||||
| Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100 | ||||
| Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100 | ||||
| Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100 | ||||
| -- | ||||
| This performance group measures the stalls caused by data traffic in the cache | ||||
| hierarchy. | ||||
| CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls. | ||||
| CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand | ||||
| load is outstanding. | ||||
| CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand | ||||
| load is outstanding. | ||||
| CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while memory subsystem has | ||||
| an outstanding load. | ||||
							
								
								
									
										22
									
								
								collectors/likwid/groups/CLX/DATA.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										22
									
								
								collectors/likwid/groups/CLX/DATA.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,22 @@ | ||||
| SHORT Load to store ratio | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  MEM_INST_RETIRED_ALL_LOADS | ||||
| PMC1  MEM_INST_RETIRED_ALL_STORES | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Load to store ratio PMC0/PMC1 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES | ||||
| - | ||||
| This is a metric to determine your load to store ratio. | ||||
|  | ||||
							
								
								
									
										24
									
								
								collectors/likwid/groups/CLX/DIVIDE.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								collectors/likwid/groups/CLX/DIVIDE.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,24 @@ | ||||
| SHORT Divide unit information | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  ARITH_DIVIDER_COUNT | ||||
| PMC1  ARITH_DIVIDER_ACTIVE | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Number of divide ops PMC0 | ||||
| Avg. divide unit usage duration PMC1/PMC0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Number of divide ops = ARITH_DIVIDER_COUNT | ||||
| Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT | ||||
| -- | ||||
| This performance group measures the average latency of divide operations. | ||||
							
								
								
									
										35
									
								
								collectors/likwid/groups/CLX/ENERGY.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								collectors/likwid/groups/CLX/ENERGY.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,35 @@ | ||||
| SHORT Power and Energy consumption | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| TMP0  TEMP_CORE | ||||
| PWR0  PWR_PKG_ENERGY | ||||
| PWR1  PWR_PP0_ENERGY | ||||
| PWR3  PWR_DRAM_ENERGY | ||||
|  | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Temperature [C]  TMP0 | ||||
| Energy [J]  PWR0 | ||||
| Power [W] PWR0/time | ||||
| Energy PP0 [J]  PWR1 | ||||
| Power PP0 [W] PWR1/time | ||||
| Energy DRAM [J]  PWR3 | ||||
| Power DRAM [W] PWR3/time | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Power = PWR_PKG_ENERGY / time | ||||
| Power PP0 = PWR_PP0_ENERGY / time | ||||
| Power DRAM = PWR_DRAM_ENERGY / time | ||||
| - | ||||
| Broadwell implements the new RAPL interface. This interface makes it possible to | ||||
| monitor the consumed energy on the package (socket) and DRAM level. | ||||
|  | ||||
							
								
								
									
										25
									
								
								collectors/likwid/groups/CLX/FLOPS_AVX.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								collectors/likwid/groups/CLX/FLOPS_AVX.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,25 @@ | ||||
| SHORT Packed AVX MFLOP/s | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | ||||
| PMC1  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | ||||
| PMC2  FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | ||||
| PMC3  FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Packed SP [MFLOP/s]  1.0E-06*(PMC0*8.0+PMC2*16.0)/time | ||||
| Packed DP [MFLOP/s]  1.0E-06*(PMC1*4.0+PMC3*8.0)/time | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime | ||||
| Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime | ||||
| - | ||||
| Packed single precision (32b) and double precision (64b) AVX FLOP rates. | ||||
							
								
								
									
										34
									
								
								collectors/likwid/groups/CLX/FLOPS_DP.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										34
									
								
								collectors/likwid/groups/CLX/FLOPS_DP.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,34 @@ | ||||
| SHORT Double Precision MFLOP/s | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | ||||
| PMC1  FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | ||||
| PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | ||||
| PMC3  FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| DP [MFLOP/s]  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time | ||||
| AVX DP [MFLOP/s]  1.0E-06*(PMC2*4.0+PMC3*8.0)/time | ||||
| AVX512 DP [MFLOP/s]  1.0E-06*(PMC3*8.0)/time | ||||
| Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2+PMC3)/time | ||||
| Scalar [MUOPS/s] 1.0E-06*PMC1/time | ||||
| Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime | ||||
| AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime | ||||
| AVX512 DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime | ||||
| Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime | ||||
| Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime | ||||
| Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE) | ||||
| - | ||||
| SSE scalar and packed double precision FLOP rates. | ||||
|  | ||||
							
								
								
									
										34
									
								
								collectors/likwid/groups/CLX/FLOPS_SP.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										34
									
								
								collectors/likwid/groups/CLX/FLOPS_SP.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,34 @@ | ||||
| SHORT Single Precision MFLOP/s | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | ||||
| PMC1  FP_ARITH_INST_RETIRED_SCALAR_SINGLE | ||||
| PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | ||||
| PMC3  FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| SP [MFLOP/s]  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time | ||||
| AVX SP [MFLOP/s]  1.0E-06*(PMC2*8.0+PMC3*16.0)/time | ||||
| AVX512 SP [MFLOP/s]  1.0E-06*(PMC3*16.0)/time | ||||
| Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2+PMC3)/time | ||||
| Scalar [MUOPS/s] 1.0E-06*PMC1/time | ||||
| Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime | ||||
| AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime | ||||
| AVX512 SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime | ||||
| Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime | ||||
| Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime | ||||
| Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE) | ||||
| - | ||||
| SSE scalar and packed single precision FLOP rates. | ||||
|  | ||||
							
								
								
									
										38
									
								
								collectors/likwid/groups/CLX/L2.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										38
									
								
								collectors/likwid/groups/CLX/L2.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,38 @@ | ||||
| SHORT L2 cache bandwidth in MBytes/s | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  L1D_REPLACEMENT | ||||
| PMC1  L1D_M_EVICT | ||||
| PMC2  ICACHE_64B_IFTAG_MISS | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time | ||||
| L2D load data volume [GBytes]  1.0E-09*PMC0*64.0 | ||||
| L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time | ||||
| L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0 | ||||
| L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time | ||||
| L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time | ||||
| L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0 | ||||
| L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time | ||||
| L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0 | ||||
| L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_64B_IFTAG_MISS)*64/time | ||||
| L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_64B_IFTAG_MISS)*64 | ||||
| - | ||||
| Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the | ||||
| number of cache lines allocated in the L1 and the number of modified cache lines | ||||
| evicted from the L1. The group also outputs the total data volume transferred between | ||||
| L2 and L1. Note that this bandwidth also includes data transfers due to a write | ||||
| allocate load on a store miss in L1 and traffic caused by misses in the | ||||
| L1 instruction cache. | ||||
|  | ||||
							
								
								
									
										34
									
								
								collectors/likwid/groups/CLX/L2CACHE.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										34
									
								
								collectors/likwid/groups/CLX/L2CACHE.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,34 @@ | ||||
| SHORT L2 cache miss rate/ratio | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  L2_TRANS_ALL_REQUESTS | ||||
| PMC1  L2_RQSTS_MISS | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| L2 request rate PMC0/FIXC0 | ||||
| L2 miss rate PMC1/FIXC0 | ||||
| L2 miss ratio PMC1/PMC0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY | ||||
| L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY | ||||
| L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS | ||||
| - | ||||
| This group measures the locality of your data accesses with regard to the | ||||
| L2 cache. L2 request rate tells you how data intensive your code is | ||||
| or how many data accesses you have on average per instruction. | ||||
| The L2 miss rate gives a measure how often it was necessary to get | ||||
| cache lines from memory. And finally L2 miss ratio tells you how many of your | ||||
| memory references required a cache line to be loaded from a higher level. | ||||
| While the data cache miss rate might be given by your algorithm you should | ||||
| try to get data cache miss ratio as low as possible by increasing your cache reuse. | ||||
|  | ||||
|  | ||||
							
								
								
									
										36
									
								
								collectors/likwid/groups/CLX/L3.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										36
									
								
								collectors/likwid/groups/CLX/L3.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,36 @@ | ||||
| SHORT  L3 cache bandwidth in MBytes/s | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  L2_LINES_IN_ALL | ||||
| PMC1  L2_TRANS_L2_WB | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time | ||||
| L3 load data volume [GBytes]  1.0E-09*PMC0*64.0 | ||||
| L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time | ||||
| L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0 | ||||
| L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time | ||||
| L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time | ||||
| L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0 | ||||
| L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time | ||||
| L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0 | ||||
| L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time | ||||
| L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 | ||||
| - | ||||
| Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the | ||||
| number of cache lines allocated in the L2 and the number of modified cache lines | ||||
| evicted from the L2. This group also outputs the data volume transferred between the | ||||
| L3 and the measured cores' L2 caches. Note that this bandwidth also includes data | ||||
| transfers due to a write allocate load on a store miss in L2. | ||||
|  | ||||
							
								
								
									
										35
									
								
								collectors/likwid/groups/CLX/L3CACHE.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								collectors/likwid/groups/CLX/L3CACHE.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,35 @@ | ||||
| SHORT L3 cache miss rate/ratio | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  MEM_LOAD_RETIRED_L3_HIT | ||||
| PMC1  MEM_LOAD_RETIRED_L3_MISS | ||||
| PMC2  UOPS_RETIRED_ALL | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| L3 request rate (PMC0+PMC1)/PMC2 | ||||
| L3 miss rate PMC1/PMC2 | ||||
| L3 miss ratio PMC1/(PMC0+PMC1) | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| L3 request rate = (MEM_LOAD_RETIRED_L3_HIT+MEM_LOAD_RETIRED_L3_MISS)/UOPS_RETIRED_ALL | ||||
| L3 miss rate = MEM_LOAD_RETIRED_L3_MISS/UOPS_RETIRED_ALL | ||||
| L3 miss ratio = MEM_LOAD_RETIRED_L3_MISS/(MEM_LOAD_RETIRED_L3_HIT+MEM_LOAD_RETIRED_L3_MISS) | ||||
| - | ||||
| This group measures the locality of your data accesses with regard to the | ||||
| L3 cache. L3 request rate tells you how data intensive your code is | ||||
| or how many data accesses you have on average per instruction. | ||||
| The L3 miss rate gives a measure how often it was necessary to get | ||||
| cache lines from memory. And finally L3 miss ratio tells you how many of your | ||||
| memory references required a cache line to be loaded from a higher level. | ||||
| While the data cache miss rate might be given by your algorithm you should | ||||
| try to get data cache miss ratio as low as possible by increasing your cache reuse. | ||||
|  | ||||
|  | ||||
							
								
								
									
										48
									
								
								collectors/likwid/groups/CLX/MEM.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										48
									
								
								collectors/likwid/groups/CLX/MEM.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,48 @@ | ||||
| SHORT Main memory bandwidth in MBytes/s | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| MBOX0C0 CAS_COUNT_RD | ||||
| MBOX0C1 CAS_COUNT_WR | ||||
| MBOX1C0 CAS_COUNT_RD | ||||
| MBOX1C1 CAS_COUNT_WR | ||||
| MBOX2C0 CAS_COUNT_RD | ||||
| MBOX2C1 CAS_COUNT_WR | ||||
| MBOX3C0 CAS_COUNT_RD | ||||
| MBOX3C1 CAS_COUNT_WR | ||||
| MBOX4C0 CAS_COUNT_RD | ||||
| MBOX4C1 CAS_COUNT_WR | ||||
| MBOX5C0 CAS_COUNT_RD | ||||
| MBOX5C1 CAS_COUNT_WR | ||||
|  | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time | ||||
| Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 | ||||
| Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time | ||||
| Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 | ||||
| Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time | ||||
| Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime | ||||
| Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 | ||||
| Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime | ||||
| Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 | ||||
| Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime | ||||
| Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 | ||||
| - | ||||
| Profiling group to measure memory bandwidth drawn by all cores of a socket. | ||||
| Since this group is based on Uncore events it is only possible to measure on a | ||||
| per socket base. Some of the counters may not be available on your system. | ||||
| Also outputs total data volume transferred from main memory. | ||||
| The same metrics are provided by the HA group. | ||||
|  | ||||
							
								
								
									
										70
									
								
								collectors/likwid/groups/CLX/MEM_DP.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										70
									
								
								collectors/likwid/groups/CLX/MEM_DP.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,70 @@ | ||||
| SHORT Overview of arithmetic and main memory performance | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PWR0  PWR_PKG_ENERGY | ||||
| PWR3  PWR_DRAM_ENERGY | ||||
| PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | ||||
| PMC1  FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | ||||
| PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | ||||
| PMC3  FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | ||||
| MBOX0C0 CAS_COUNT_RD | ||||
| MBOX0C1 CAS_COUNT_WR | ||||
| MBOX1C0 CAS_COUNT_RD | ||||
| MBOX1C1 CAS_COUNT_WR | ||||
| MBOX2C0 CAS_COUNT_RD | ||||
| MBOX2C1 CAS_COUNT_WR | ||||
| MBOX3C0 CAS_COUNT_RD | ||||
| MBOX3C1 CAS_COUNT_WR | ||||
| MBOX4C0 CAS_COUNT_RD | ||||
| MBOX4C1 CAS_COUNT_WR | ||||
| MBOX5C0 CAS_COUNT_RD | ||||
| MBOX5C1 CAS_COUNT_WR | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Energy [J]  PWR0 | ||||
| Power [W] PWR0/time | ||||
| Energy DRAM [J]  PWR3 | ||||
| Power DRAM [W] PWR3/time | ||||
| DP [MFLOP/s]  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time | ||||
| AVX DP [MFLOP/s]  1.0E-06*(PMC2*4.0+PMC3*8.0)/time | ||||
| Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2+PMC3)/time | ||||
| Scalar [MUOPS/s] 1.0E-06*PMC1/time | ||||
| Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time | ||||
| Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 | ||||
| Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time | ||||
| Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 | ||||
| Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time | ||||
| Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 | ||||
| Operational intensity (PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0) | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Power [W] = PWR_PKG_ENERGY/runtime | ||||
| Power DRAM [W] = PWR_DRAM_ENERGY/runtime | ||||
| DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime | ||||
| AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime | ||||
| Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime | ||||
| Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime | ||||
| Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/runtime | ||||
| Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0 | ||||
| Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime | ||||
| Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 | ||||
| Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/runtime | ||||
| Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 | ||||
| Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/((SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0) | ||||
| -- | ||||
| Profiling group to measure memory bandwidth drawn by all cores of a socket. | ||||
| Since this group is based on Uncore events it is only possible to measure on | ||||
| a per socket base. Also outputs total data volume transferred from main memory. | ||||
| SSE scalar and packed double precision FLOP rates. Also reports on packed AVX | ||||
| double precision instructions. | ||||
| The operational intensity is calculated using the FP values of the cores and the | ||||
| memory data volume of the whole socket. The actual operational intensity for | ||||
| multiple CPUs can be found in the statistics table in the Sum column. | ||||
							
								
								
									
										70
									
								
								collectors/likwid/groups/CLX/MEM_SP.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										70
									
								
								collectors/likwid/groups/CLX/MEM_SP.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,70 @@ | ||||
| SHORT Overview of arithmetic and main memory performance | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PWR0  PWR_PKG_ENERGY | ||||
| PWR3  PWR_DRAM_ENERGY | ||||
| PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | ||||
| PMC1  FP_ARITH_INST_RETIRED_SCALAR_SINGLE | ||||
| PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | ||||
| PMC3  FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | ||||
| MBOX0C0 CAS_COUNT_RD | ||||
| MBOX0C1 CAS_COUNT_WR | ||||
| MBOX1C0 CAS_COUNT_RD | ||||
| MBOX1C1 CAS_COUNT_WR | ||||
| MBOX2C0 CAS_COUNT_RD | ||||
| MBOX2C1 CAS_COUNT_WR | ||||
| MBOX3C0 CAS_COUNT_RD | ||||
| MBOX3C1 CAS_COUNT_WR | ||||
| MBOX4C0 CAS_COUNT_RD | ||||
| MBOX4C1 CAS_COUNT_WR | ||||
| MBOX5C0 CAS_COUNT_RD | ||||
| MBOX5C1 CAS_COUNT_WR | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Energy [J]  PWR0 | ||||
| Power [W] PWR0/time | ||||
| Energy DRAM [J]  PWR3 | ||||
| Power DRAM [W] PWR3/time | ||||
| SP [MFLOP/s]  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time | ||||
| AVX SP [MFLOP/s]  1.0E-06*(PMC2*8.0+PMC3*16.0)/time | ||||
| Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2+PMC3)/time | ||||
| Scalar [MUOPS/s] 1.0E-06*PMC1/time | ||||
| Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time | ||||
| Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 | ||||
| Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time | ||||
| Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 | ||||
| Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time | ||||
| Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 | ||||
| Operational intensity (PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0) | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Power [W] = PWR_PKG_ENERGY/runtime | ||||
| Power DRAM [W] = PWR_DRAM_ENERGY/runtime | ||||
| SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime | ||||
| AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime | ||||
| Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime | ||||
| Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime | ||||
| Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/runtime | ||||
| Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0 | ||||
| Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime | ||||
| Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 | ||||
| Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/runtime | ||||
| Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 | ||||
| Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/((SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0) | ||||
| -- | ||||
| Profiling group to measure memory bandwidth drawn by all cores of a socket. | ||||
| Since this group is based on Uncore events, it is only possible to measure on | ||||
| a per-socket basis. Also outputs total data volume transferred from main memory. | ||||
| SSE scalar and packed single precision FLOP rates. Also reports on packed AVX | ||||
| 32b instructions. | ||||
| The operational intensity is calculated using the FP values of the cores and the | ||||
| memory data volume of the whole socket. The actual operational intensity for | ||||
| multiple CPUs can be found in the statistics table in the Sum column. | ||||
							
								
								
									
										46
									
								
								collectors/likwid/groups/CLX/PMM.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										46
									
								
								collectors/likwid/groups/CLX/PMM.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,46 @@ | ||||
| SHORT Intel Optane DC bandwidth in MBytes/s | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| MBOX0C0 PMM_CMD1_RD | ||||
| MBOX0C1 PMM_CMD1_WR | ||||
| MBOX1C0 PMM_CMD1_RD | ||||
| MBOX1C1 PMM_CMD1_WR | ||||
| MBOX2C0 PMM_CMD1_RD | ||||
| MBOX2C1 PMM_CMD1_WR | ||||
| MBOX3C0 PMM_CMD1_RD | ||||
| MBOX3C1 PMM_CMD1_WR | ||||
| MBOX4C0 PMM_CMD1_RD | ||||
| MBOX4C1 PMM_CMD1_WR | ||||
| MBOX5C0 PMM_CMD1_RD | ||||
| MBOX5C1 PMM_CMD1_WR | ||||
|  | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| PMM read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time | ||||
| PMM read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 | ||||
| PMM write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time | ||||
| PMM write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 | ||||
| PMM bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time | ||||
| PMM data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| PMM read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime | ||||
| PMM read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 | ||||
| PMM write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime | ||||
| PMM write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 | ||||
| PMM bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime | ||||
| PMM data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 | ||||
| - | ||||
| Profiling group to measure data rate and volume for accesses to Intel Optane DC | ||||
| persistent memory. The Intel Optane DC devices are handled by the memory | ||||
| controllers but require different events. | ||||
|  | ||||
							
								
								
									
										35
									
								
								collectors/likwid/groups/CLX/TLB_DATA.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								collectors/likwid/groups/CLX/TLB_DATA.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,35 @@ | ||||
| SHORT  L2 data TLB miss rate/ratio | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  DTLB_LOAD_MISSES_CAUSES_A_WALK | ||||
| PMC1  DTLB_STORE_MISSES_CAUSES_A_WALK | ||||
| PMC2  DTLB_LOAD_MISSES_WALK_ACTIVE | ||||
| PMC3  DTLB_STORE_MISSES_WALK_ACTIVE | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| L1 DTLB load misses     PMC0 | ||||
| L1 DTLB load miss rate  PMC0/FIXC0 | ||||
| L1 DTLB load miss duration [Cyc] PMC2/PMC0 | ||||
| L1 DTLB store misses     PMC1 | ||||
| L1 DTLB store miss rate  PMC1/FIXC0 | ||||
| L1 DTLB store miss duration [Cyc] PMC3/PMC1 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK | ||||
| L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY | ||||
| L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_ACTIVE / DTLB_LOAD_MISSES_CAUSES_A_WALK | ||||
| L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK | ||||
| L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY | ||||
| L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_ACTIVE / DTLB_STORE_MISSES_CAUSES_A_WALK | ||||
| - | ||||
| The DTLB load and store miss rates give a measure of how often a TLB miss occurred | ||||
| per instruction. The duration measures how long a page walk took, in cycles. | ||||
|  | ||||
							
								
								
									
										28
									
								
								collectors/likwid/groups/CLX/TLB_INSTR.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										28
									
								
								collectors/likwid/groups/CLX/TLB_INSTR.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,28 @@ | ||||
| SHORT  L1 Instruction TLB miss rate/ratio | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  ITLB_MISSES_CAUSES_A_WALK | ||||
| PMC1  ITLB_MISSES_WALK_ACTIVE | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| L1 ITLB misses     PMC0 | ||||
| L1 ITLB miss rate  PMC0/FIXC0 | ||||
| L1 ITLB miss duration [Cyc] PMC1/PMC0 | ||||
|  | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK | ||||
| L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY | ||||
| L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_ACTIVE / ITLB_MISSES_CAUSES_A_WALK | ||||
| - | ||||
| The ITLB miss rate gives a measure of how often a TLB miss occurred | ||||
| per instruction. The duration measures how long a page walk took, in cycles. | ||||
|  | ||||
							
								
								
									
										48
									
								
								collectors/likwid/groups/CLX/TMA.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										48
									
								
								collectors/likwid/groups/CLX/TMA.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,48 @@ | ||||
| SHORT Top down cycle allocation | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0 UOPS_ISSUED_ANY | ||||
| PMC1 UOPS_RETIRED_RETIRE_SLOTS | ||||
| PMC2 IDQ_UOPS_NOT_DELIVERED_CORE | ||||
| PMC3 INT_MISC_RECOVERY_CYCLES | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| IPC FIXC0/FIXC1 | ||||
| Total Slots 4*FIXC1 | ||||
| Slots Retired PMC1 | ||||
| Fetch Bubbles PMC2 | ||||
| Recovery Bubbles 4*PMC3 | ||||
| Front End [%] PMC2/(4*FIXC1)*100 | ||||
| Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100 | ||||
| Retiring [%] PMC1/(4*FIXC1)*100 | ||||
| Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Total Slots = 4*CPU_CLK_UNHALTED_CORE | ||||
| Slots Retired = UOPS_RETIRED_RETIRE_SLOTS | ||||
| Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE | ||||
| Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES | ||||
| Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100 | ||||
| Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100 | ||||
| Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100 | ||||
| Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100 | ||||
| -- | ||||
| This performance group measures cycles to determine percentage of time spent in | ||||
| front end, back end, retiring and speculation. These metrics are published and | ||||
| verified by Intel. Further information: | ||||
| Webpage describing Top-Down Method and its usage in Intel vTune: | ||||
| https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method | ||||
| Paper by Ahmad Yasin: | ||||
| https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0 | ||||
| Slides by Ahmad Yasin: | ||||
| http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf | ||||
| The performance group was originally published here: | ||||
| http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/ | ||||
							
								
								
									
										31
									
								
								collectors/likwid/groups/CLX/UOPS_EXEC.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										31
									
								
								collectors/likwid/groups/CLX/UOPS_EXEC.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,31 @@ | ||||
| SHORT UOPs execution | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  UOPS_EXECUTED_USED_CYCLES | ||||
| PMC1  UOPS_EXECUTED_STALL_CYCLES | ||||
| PMC2  CPU_CLOCK_UNHALTED_TOTAL_CYCLES | ||||
| PMC3:EDGEDETECT  UOPS_EXECUTED_STALL_CYCLES | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Used cycles ratio [%] 100*PMC0/PMC2 | ||||
| Unused cycles ratio [%] 100*PMC1/PMC2 | ||||
| Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT | ||||
|  | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES | ||||
| Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES | ||||
| Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT | ||||
| - | ||||
| This performance group returns the ratios of used and unused cycles regarding | ||||
| the execution stage in the pipeline. Used cycles are all cycles where uOPs are | ||||
| executed while unused cycles refer to pipeline stalls. Moreover, the group | ||||
| calculates the average stall duration in cycles. | ||||
							
								
								
									
										31
									
								
								collectors/likwid/groups/CLX/UOPS_ISSUE.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										31
									
								
								collectors/likwid/groups/CLX/UOPS_ISSUE.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,31 @@ | ||||
| SHORT UOPs issuing | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  UOPS_ISSUED_USED_CYCLES | ||||
| PMC1  UOPS_ISSUED_STALL_CYCLES | ||||
| PMC2  CPU_CLOCK_UNHALTED_TOTAL_CYCLES | ||||
| PMC3:EDGEDETECT  UOPS_ISSUED_STALL_CYCLES | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Used cycles ratio [%] 100*PMC0/PMC2 | ||||
| Unused cycles ratio [%] 100*PMC1/PMC2 | ||||
| Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT | ||||
|  | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES | ||||
| Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES | ||||
| Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT | ||||
| - | ||||
| This performance group returns the ratios of used and unused cycles regarding | ||||
| the issue stage in the pipeline. Used cycles are all cycles where uOPs are | ||||
| issued while unused cycles refer to pipeline stalls. Moreover, the group | ||||
| calculates the average stall duration in cycles. | ||||
							
								
								
									
										31
									
								
								collectors/likwid/groups/CLX/UOPS_RETIRE.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										31
									
								
								collectors/likwid/groups/CLX/UOPS_RETIRE.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,31 @@ | ||||
| SHORT UOPs retirement | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  UOPS_RETIRED_USED_CYCLES | ||||
| PMC1  UOPS_RETIRED_STALL_CYCLES | ||||
| PMC2  CPU_CLOCK_UNHALTED_TOTAL_CYCLES | ||||
| PMC3:EDGEDETECT  UOPS_RETIRED_STALL_CYCLES | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Used cycles ratio [%] 100*PMC0/PMC2 | ||||
| Unused cycles ratio [%] 100*PMC1/PMC2 | ||||
| Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT | ||||
|  | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES | ||||
| Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES | ||||
| Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT | ||||
| - | ||||
| This performance group returns the ratios of used and unused cycles regarding | ||||
| the retirement stage in the pipeline (re-order buffer). Used cycles are all | ||||
| cycles where uOPs are retired while unused cycles refer to pipeline stalls. | ||||
| Moreover, the group calculates the average stall duration in cycles. | ||||
							
								
								
									
										42
									
								
								collectors/likwid/groups/CLX/UPI.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										42
									
								
								collectors/likwid/groups/CLX/UPI.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,42 @@ | ||||
| SHORT UPI data traffic | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| SBOX0C0 TXL_FLITS_ALL_DATA | ||||
| SBOX0C1 RXL_FLITS_ALL_DATA | ||||
| SBOX1C0 TXL_FLITS_ALL_DATA | ||||
| SBOX1C1 RXL_FLITS_ALL_DATA | ||||
| SBOX2C0 TXL_FLITS_ALL_DATA | ||||
| SBOX2C1 RXL_FLITS_ALL_DATA | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Received data bandwidth [MByte/s] 1.0E-06*((SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0/time | ||||
| Received data volume [GByte] 1.0E-09*((SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0 | ||||
| Sent data bandwidth [MByte/s] 1.0E-06*((SBOX0C0+SBOX1C0+SBOX2C0)/9.0)*64.0/time | ||||
| Sent data volume [GByte] 1.0E-09*((SBOX0C0+SBOX1C0+SBOX2C0)/9.0)*64.0 | ||||
| Total data bandwidth [MByte/s] 1.0E-06*((SBOX0C0+SBOX1C0+SBOX2C0+SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0/time | ||||
| Total data volume [GByte] 1.0E-09*((SBOX0C0+SBOX1C0+SBOX2C0+SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0 | ||||
|  | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Received data bandwidth [MByte/s] = 1.0E-06*(SUM(RXL_FLITS_ALL_DATA)/9.0)*64.0/runtime | ||||
| Received data volume [GByte] = 1.0E-09*(SUM(RXL_FLITS_ALL_DATA)/9.0)*64.0 | ||||
| Sent data bandwidth [MByte/s] = 1.0E-06*(SUM(TXL_FLITS_ALL_DATA)/9.0)*64.0/runtime | ||||
| Sent data volume [GByte] = 1.0E-09*(SUM(TXL_FLITS_ALL_DATA)/9.0)*64.0 | ||||
| Total data bandwidth [MByte/s] = 1.0E-06*((SUM(RXL_FLITS_ALL_DATA)+SUM(TXL_FLITS_ALL_DATA))/9.0)*64.0/runtime | ||||
| Total data volume [GByte] = 1.0E-09*((SUM(RXL_FLITS_ALL_DATA)+SUM(TXL_FLITS_ALL_DATA))/9.0)*64.0 | ||||
| -- | ||||
| This group measures the data traffic on the UPI (socket interconnect). The group | ||||
| measures all filled data slots (9 slots per 64 Byte data transfer), that's why | ||||
| the count needs to be divided by 9. These 9 data chunks are not transferred in | ||||
| a single flit but there is one flit for the header and three flits for the data. | ||||
| The metrics show higher values than expected because the events also count | ||||
| other transfers which include data. | ||||
							
								
								
									
										31
									
								
								collectors/likwid/groups/ICL/BRANCH.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										31
									
								
								collectors/likwid/groups/ICL/BRANCH.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,31 @@ | ||||
| SHORT Branch prediction miss rate/ratio | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  BR_INST_RETIRED_ALL_BRANCHES | ||||
| PMC1  BR_MISP_RETIRED_ALL_BRANCHES | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Branch rate   PMC0/FIXC0 | ||||
| Branch misprediction rate  PMC1/FIXC0 | ||||
| Branch misprediction ratio  PMC1/PMC0 | ||||
| Instructions per branch  FIXC0/PMC0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY | ||||
| Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY | ||||
| Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES | ||||
| Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES | ||||
| - | ||||
| The rates state how often on average a branch or a mispredicted branch occurred | ||||
| per instruction retired in total. The branch misprediction ratio states | ||||
| directly what fraction of all branch instructions were mispredicted. | ||||
| Instructions per branch is 1/branch rate. | ||||
|  | ||||
							
								
								
									
										22
									
								
								collectors/likwid/groups/ICL/DATA.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										22
									
								
								collectors/likwid/groups/ICL/DATA.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,22 @@ | ||||
| SHORT Load to store ratio | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  MEM_INST_RETIRED_ALL_LOADS | ||||
| PMC1  MEM_INST_RETIRED_ALL_STORES | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Load to store ratio PMC0/PMC1 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES | ||||
| - | ||||
| This is a metric to determine your load to store ratio. | ||||
|  | ||||
							
								
								
									
										24
									
								
								collectors/likwid/groups/ICL/DIVIDE.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								collectors/likwid/groups/ICL/DIVIDE.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,24 @@ | ||||
| SHORT Divide unit information | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  ARITH_DIVIDER_COUNT | ||||
| PMC1  ARITH_DIVIDER_ACTIVE | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Number of divide ops PMC0 | ||||
| Avg. divide unit usage duration PMC1/PMC0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Number of divide ops = ARITH_DIVIDER_COUNT | ||||
| Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT | ||||
| - | ||||
| This performance group measures the average latency of divide operations | ||||
							
								
								
									
										35
									
								
								collectors/likwid/groups/ICL/ENERGY.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								collectors/likwid/groups/ICL/ENERGY.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,35 @@ | ||||
| SHORT Power and Energy consumption | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| TMP0  TEMP_CORE | ||||
| PWR0  PWR_PKG_ENERGY | ||||
| PWR1  PWR_PP0_ENERGY | ||||
| PWR3  PWR_DRAM_ENERGY | ||||
|  | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Temperature [C]  TMP0 | ||||
| Energy [J]  PWR0 | ||||
| Power [W] PWR0/time | ||||
| Energy PP0 [J]  PWR1 | ||||
| Power PP0 [W] PWR1/time | ||||
| Energy DRAM [J]  PWR3 | ||||
| Power DRAM [W] PWR3/time | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Power = PWR_PKG_ENERGY / time | ||||
| Power PP0 = PWR_PP0_ENERGY / time | ||||
| Power DRAM = PWR_DRAM_ENERGY / time | ||||
| - | ||||
| Broadwell implements the new RAPL interface. This interface enables | ||||
| monitoring the consumed energy on the package (socket) and DRAM level. | ||||
|  | ||||
							
								
								
									
										25
									
								
								collectors/likwid/groups/ICL/FLOPS_AVX.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								collectors/likwid/groups/ICL/FLOPS_AVX.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,25 @@ | ||||
| SHORT Packed AVX MFLOP/s | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | ||||
| PMC1  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | ||||
| PMC2  FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | ||||
| PMC3  FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Packed SP [MFLOP/s]  1.0E-06*(PMC0*8.0+PMC2*16.0)/time | ||||
| Packed DP [MFLOP/s]  1.0E-06*(PMC1*4.0+PMC3*8.0)/time | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime | ||||
| Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime | ||||
| - | ||||
| Packed AVX single (32b) and double (64b) precision FLOP rates. | ||||
							
								
								
									
										34
									
								
								collectors/likwid/groups/ICL/FLOPS_DP.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										34
									
								
								collectors/likwid/groups/ICL/FLOPS_DP.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,34 @@ | ||||
| SHORT Double Precision MFLOP/s | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | ||||
| PMC1  FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | ||||
| PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | ||||
| PMC3  FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| DP [MFLOP/s]  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time | ||||
| AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time | ||||
| AVX512 DP [MFLOP/s]  1.0E-06*(PMC3*8.0)/time | ||||
| Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2+PMC3)/time | ||||
| Scalar [MUOPS/s] 1.0E-06*PMC1/time | ||||
| Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime | ||||
| AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime | ||||
| AVX512 DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime | ||||
| Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime | ||||
| Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime | ||||
| Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE) | ||||
| - | ||||
| SSE scalar and packed double precision FLOP rates. | ||||
|  | ||||
							
								
								
									
										34
									
								
								collectors/likwid/groups/ICL/FLOPS_SP.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										34
									
								
								collectors/likwid/groups/ICL/FLOPS_SP.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,34 @@ | ||||
| SHORT Single Precision MFLOP/s | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | ||||
| PMC1  FP_ARITH_INST_RETIRED_SCALAR_SINGLE | ||||
| PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | ||||
| PMC3  FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| SP [MFLOP/s]  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time | ||||
| AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time | ||||
| AVX512 SP [MFLOP/s]  1.0E-06*(PMC3*16.0)/time | ||||
| Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2+PMC3)/time | ||||
| Scalar [MUOPS/s] 1.0E-06*PMC1/time | ||||
| Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime | ||||
| AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime | ||||
| AVX512 SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime | ||||
| Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime | ||||
| Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime | ||||
| Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE) | ||||
| - | ||||
| Scalar and packed (SSE, AVX, AVX-512) single precision FLOP rates. | ||||
|  | ||||
							
								
								
									
										32
									
								
								collectors/likwid/groups/ICX/BRANCH.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										32
									
								
								collectors/likwid/groups/ICX/BRANCH.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,32 @@ | ||||
| SHORT Branch prediction miss rate/ratio | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| FIXC3 TOPDOWN_SLOTS | ||||
| PMC0  BR_INST_RETIRED_ALL_BRANCHES | ||||
| PMC1  BR_MISP_RETIRED_ALL_BRANCHES | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Branch rate   PMC0/FIXC0 | ||||
| Branch misprediction rate  PMC1/FIXC0 | ||||
| Branch misprediction ratio  PMC1/PMC0 | ||||
| Instructions per branch  FIXC0/PMC0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY | ||||
| Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY | ||||
| Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES | ||||
| Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES | ||||
| - | ||||
| The rates state how often on average a branch or a mispredicted branch occurred | ||||
| per instruction retired in total. The branch misprediction ratio directly | ||||
| states what fraction of all branch instructions were mispredicted. | ||||
| Instructions per branch is 1/branch rate. | ||||
|  | ||||
							
								
								
									
										23
									
								
								collectors/likwid/groups/ICX/DATA.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								collectors/likwid/groups/ICX/DATA.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,23 @@ | ||||
| SHORT Load to store ratio | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| FIXC3 TOPDOWN_SLOTS | ||||
| PMC0  MEM_INST_RETIRED_ALL_LOADS | ||||
| PMC1  MEM_INST_RETIRED_ALL_STORES | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Load to store ratio PMC0/PMC1 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES | ||||
| - | ||||
| This is a metric to determine your load to store ratio. | ||||
|  | ||||
							
								
								
									
										25
									
								
								collectors/likwid/groups/ICX/DIVIDE.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								collectors/likwid/groups/ICX/DIVIDE.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,25 @@ | ||||
| SHORT Divide unit information | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| FIXC3 TOPDOWN_SLOTS | ||||
| PMC0  ARITH_DIVIDER_COUNT | ||||
| PMC1  ARITH_DIVIDER_ACTIVE | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Number of divide ops PMC0 | ||||
| Avg. divide unit usage duration PMC1/PMC0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Number of divide ops = ARITH_DIVIDER_COUNT | ||||
| Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT | ||||
| - | ||||
| This performance group measures the average latency of divide operations. | ||||
							
								
								
									
										26
									
								
								collectors/likwid/groups/ICX/FLOPS_AVX.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								collectors/likwid/groups/ICX/FLOPS_AVX.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,26 @@ | ||||
| SHORT Packed AVX MFLOP/s | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| FIXC3 TOPDOWN_SLOTS | ||||
| PMC0  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | ||||
| PMC1  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | ||||
| PMC2  FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | ||||
| PMC3  FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Packed SP [MFLOP/s]  1.0E-06*(PMC0*8.0+PMC2*16.0)/time | ||||
| Packed DP [MFLOP/s]  1.0E-06*(PMC1*4.0+PMC3*8.0)/time | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime | ||||
| Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime | ||||
| - | ||||
| Packed AVX and AVX-512 single and double precision FLOP rates. | ||||
							
								
								
									
										35
									
								
								collectors/likwid/groups/ICX/FLOPS_DP.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								collectors/likwid/groups/ICX/FLOPS_DP.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,35 @@ | ||||
| SHORT Double Precision MFLOP/s | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| FIXC3 TOPDOWN_SLOTS | ||||
| PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | ||||
| PMC1  FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | ||||
| PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | ||||
| PMC3  FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| DP [MFLOP/s]  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time | ||||
| AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time | ||||
| AVX512 DP [MFLOP/s]  1.0E-06*(PMC3*8.0)/time | ||||
| Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2+PMC3)/time | ||||
| Scalar [MUOPS/s] 1.0E-06*PMC1/time | ||||
| Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime | ||||
| AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime | ||||
| AVX512 DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime | ||||
| Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime | ||||
| Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime | ||||
| Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE) | ||||
| - | ||||
| Scalar and packed (SSE, AVX, AVX-512) double precision FLOP rates. | ||||
|  | ||||
							
								
								
									
										35
									
								
								collectors/likwid/groups/ICX/FLOPS_SP.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								collectors/likwid/groups/ICX/FLOPS_SP.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,35 @@ | ||||
| SHORT Single Precision MFLOP/s | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| FIXC3 TOPDOWN_SLOTS | ||||
| PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | ||||
| PMC1  FP_ARITH_INST_RETIRED_SCALAR_SINGLE | ||||
| PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | ||||
| PMC3  FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| SP [MFLOP/s]  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time | ||||
| AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time | ||||
| AVX512 SP [MFLOP/s]  1.0E-06*(PMC3*16.0)/time | ||||
| Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2+PMC3)/time | ||||
| Scalar [MUOPS/s] 1.0E-06*PMC1/time | ||||
| Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime | ||||
| AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime | ||||
| AVX512 SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime | ||||
| Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime | ||||
| Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime | ||||
| Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE) | ||||
| - | ||||
| Scalar and packed (SSE, AVX, AVX-512) single precision FLOP rates. | ||||
|  | ||||
							
								
								
									
										39
									
								
								collectors/likwid/groups/ICX/L2.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										39
									
								
								collectors/likwid/groups/ICX/L2.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,39 @@ | ||||
| SHORT L2 cache bandwidth in MBytes/s | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| FIXC3 TOPDOWN_SLOTS | ||||
| PMC0  L1D_REPLACEMENT | ||||
| PMC1  L2_TRANS_L1D_WB | ||||
| PMC2  ICACHE_64B_IFTAG_MISS | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time | ||||
| L2D load data volume [GBytes]  1.0E-09*PMC0*64.0 | ||||
| L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time | ||||
| L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0 | ||||
| L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time | ||||
| L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time | ||||
| L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0 | ||||
| L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time | ||||
| L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0 | ||||
| L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_64B_IFTAG_MISS)*64/time | ||||
| L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_64B_IFTAG_MISS)*64 | ||||
| - | ||||
| Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the | ||||
| number of cache lines allocated in the L1 and the number of modified cache lines | ||||
| evicted from the L1. The group also outputs the total data volume transferred between | ||||
| L2 and L1. Note that this bandwidth also includes data transfers due to a write | ||||
| allocate load on a store miss in L1 and traffic caused by misses in the | ||||
| L1 instruction cache. | ||||
|  | ||||
							
								
								
									
										31
									
								
								collectors/likwid/groups/TGL/BRANCH.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										31
									
								
								collectors/likwid/groups/TGL/BRANCH.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,31 @@ | ||||
| SHORT Branch prediction miss rate/ratio | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  BR_INST_RETIRED_ALL_BRANCHES | ||||
| PMC1  BR_MISP_RETIRED_ALL_BRANCHES | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Branch rate   PMC0/FIXC0 | ||||
| Branch misprediction rate  PMC1/FIXC0 | ||||
| Branch misprediction ratio  PMC1/PMC0 | ||||
| Instructions per branch  FIXC0/PMC0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY | ||||
| Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY | ||||
| Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES | ||||
| Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES | ||||
| - | ||||
| The rates state how often on average a branch or a mispredicted branch occurred | ||||
| per instruction retired in total. The branch misprediction ratio directly | ||||
| states what fraction of all branch instructions were mispredicted. | ||||
| Instructions per branch is 1/branch rate. | ||||
|  | ||||
							
								
								
									
										22
									
								
								collectors/likwid/groups/TGL/DATA.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										22
									
								
								collectors/likwid/groups/TGL/DATA.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,22 @@ | ||||
| SHORT Load to store ratio | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  MEM_INST_RETIRED_ALL_LOADS | ||||
| PMC1  MEM_INST_RETIRED_ALL_STORES | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Load to store ratio PMC0/PMC1 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES | ||||
| - | ||||
| This is a metric to determine your load to store ratio. | ||||
|  | ||||
							
								
								
									
										24
									
								
								collectors/likwid/groups/TGL/DIVIDE.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								collectors/likwid/groups/TGL/DIVIDE.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,24 @@ | ||||
| SHORT Divide unit information | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  ARITH_DIVIDER_COUNT | ||||
| PMC1  ARITH_DIVIDER_ACTIVE | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Number of divide ops PMC0 | ||||
| Avg. divide unit usage duration PMC1/PMC0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Number of divide ops = ARITH_DIVIDER_COUNT | ||||
| Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT | ||||
| - | ||||
| This performance group measures the average latency of divide operations. | ||||
							
								
								
									
										35
									
								
								collectors/likwid/groups/TGL/ENERGY.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								collectors/likwid/groups/TGL/ENERGY.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,35 @@ | ||||
| SHORT Power and Energy consumption | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| TMP0  TEMP_CORE | ||||
| PWR0  PWR_PKG_ENERGY | ||||
| PWR1  PWR_PP0_ENERGY | ||||
| PWR3  PWR_DRAM_ENERGY | ||||
|  | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Temperature [C]  TMP0 | ||||
| Energy [J]  PWR0 | ||||
| Power [W] PWR0/time | ||||
| Energy PP0 [J]  PWR1 | ||||
| Power PP0 [W] PWR1/time | ||||
| Energy DRAM [J]  PWR3 | ||||
| Power DRAM [W] PWR3/time | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Power = PWR_PKG_ENERGY / time | ||||
| Power PP0 = PWR_PP0_ENERGY / time | ||||
| Power DRAM = PWR_DRAM_ENERGY / time | ||||
| - | ||||
| Tiger Lake implements the RAPL interface. This interface makes it possible to | ||||
| monitor the consumed energy at the package (socket) and DRAM level. | ||||
|  | ||||
							
								
								
									
										25
									
								
								collectors/likwid/groups/TGL/FLOPS_AVX.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								collectors/likwid/groups/TGL/FLOPS_AVX.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,25 @@ | ||||
| SHORT Packed AVX MFLOP/s | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | ||||
| PMC1  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | ||||
| PMC2  FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | ||||
| PMC3  FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Packed SP [MFLOP/s]  1.0E-06*(PMC0*8.0+PMC2*16.0)/time | ||||
| Packed DP [MFLOP/s]  1.0E-06*(PMC1*4.0+PMC3*8.0)/time | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime | ||||
| Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime | ||||
| - | ||||
| Packed AVX and AVX-512 single and double precision FLOP rates. | ||||
							
								
								
									
										34
									
								
								collectors/likwid/groups/TGL/FLOPS_DP.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										34
									
								
								collectors/likwid/groups/TGL/FLOPS_DP.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,34 @@ | ||||
| SHORT Double Precision MFLOP/s | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | ||||
| PMC1  FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | ||||
| PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | ||||
| PMC3  FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| DP [MFLOP/s]  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time | ||||
| AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time | ||||
| AVX512 DP [MFLOP/s]  1.0E-06*(PMC3*8.0)/time | ||||
| Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2+PMC3)/time | ||||
| Scalar [MUOPS/s] 1.0E-06*PMC1/time | ||||
| Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime | ||||
| AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime | ||||
| AVX512 DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime | ||||
| Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime | ||||
| Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime | ||||
| Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE) | ||||
| - | ||||
| Scalar and packed (SSE, AVX, AVX-512) double precision FLOP rates. | ||||
|  | ||||
							
								
								
									
										34
									
								
								collectors/likwid/groups/TGL/FLOPS_SP.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										34
									
								
								collectors/likwid/groups/TGL/FLOPS_SP.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,34 @@ | ||||
| SHORT Single Precision MFLOP/s | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | ||||
| PMC1  FP_ARITH_INST_RETIRED_SCALAR_SINGLE | ||||
| PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | ||||
| PMC3  FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| SP [MFLOP/s]  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time | ||||
| AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time | ||||
| AVX512 SP [MFLOP/s]  1.0E-06*(PMC3*16.0)/time | ||||
| Packed [MUOPS/s]   1.0E-06*(PMC0+PMC2+PMC3)/time | ||||
| Scalar [MUOPS/s] 1.0E-06*PMC1/time | ||||
| Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime | ||||
| AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime | ||||
| AVX512 SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime | ||||
| Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime | ||||
| Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime | ||||
| Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE) | ||||
| - | ||||
| Scalar and packed (SSE, AVX, AVX-512) single precision FLOP rates. | ||||
|  | ||||
							
								
								
									
										30
									
								
								collectors/likwid/groups/arm64fx/BRANCH.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										30
									
								
								collectors/likwid/groups/arm64fx/BRANCH.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,30 @@ | ||||
| SHORT Branch prediction miss rate/ratio | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  BR_PRED | ||||
| PMC3  BR_MIS_PRED | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| CPI  PMC1/PMC0 | ||||
| Branch rate   PMC2/PMC0 | ||||
| Branch misprediction rate  PMC3/PMC0 | ||||
| Branch misprediction ratio  PMC3/(PMC2+PMC3) | ||||
| Instructions per branch  PMC0/(PMC2+PMC3) | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| CPI = CPU_CYCLES/INST_RETIRED | ||||
| Branch rate = BR_PRED/INST_RETIRED | ||||
| Branch misprediction rate =  BR_MIS_PRED/INST_RETIRED | ||||
| Branch misprediction ratio = BR_MIS_PRED/(BR_PRED+BR_MIS_PRED) | ||||
| Instructions per branch = INST_RETIRED/(BR_PRED+BR_MIS_PRED) | ||||
| - | ||||
| The rates state how often on average a branch or a mispredicted branch occurred | ||||
| per instruction retired in total. The Branch misprediction ratio directly | ||||
| states what fraction of all branch instructions were mispredicted. | ||||
| Instructions per branch is 1/Branch rate. | ||||
|  | ||||
							
								
								
									
										24
									
								
								collectors/likwid/groups/arm64fx/DATA.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								collectors/likwid/groups/arm64fx/DATA.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,24 @@ | ||||
| SHORT Load to store ratio | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_SPEC | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  LD_SPEC | ||||
| PMC3  ST_SPEC | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| CPI  PMC1/PMC0 | ||||
| Load to store ratio PMC2/PMC3 | ||||
| Load ratio PMC2/PMC0 | ||||
| Store ratio PMC3/PMC0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| CPI = CPU_CYCLES/INST_SPEC | ||||
| Load to store ratio = LD_SPEC / ST_SPEC | ||||
| Load ratio = LD_SPEC / INST_SPEC | ||||
| Store ratio = ST_SPEC / INST_SPEC | ||||
| - | ||||
| This is a metric to determine your load to store ratio. | ||||
|  | ||||
							
								
								
									
										26
									
								
								collectors/likwid/groups/arm64fx/FLOPS_DP.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								collectors/likwid/groups/arm64fx/FLOPS_DP.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,26 @@ | ||||
| SHORT Double Precision MFLOP/s | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC3  FP_DP_FIXED_OPS_SPEC | ||||
| PMC4  FP_DP_SCALE_OPS_SPEC | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Clock [MHz] 1.E-06*PMC1/time | ||||
| CPI  PMC1/PMC0 | ||||
| DP (FP) [MFLOP/s] 1E-06*(PMC3)/time | ||||
| DP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC4*128)/128)+PMC3)/time | ||||
| DP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC4*256)/128)+PMC3)/time | ||||
| DP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC4*512)/128)+PMC3)/time | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| DP (FP) [MFLOP/s] = 1E-06*FP_DP_FIXED_OPS_SPEC/time | ||||
| DP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*128)/128))/time | ||||
| DP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*256)/128))/time | ||||
| DP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*512)/128))/time | ||||
| - | ||||
| Double-precision FP rate for scalar and SVE vector operations with different widths. The | ||||
| SVE metrics assume that all vector elements are active. | ||||
							
								
								
									
										26
									
								
								collectors/likwid/groups/arm64fx/FLOPS_HP.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								collectors/likwid/groups/arm64fx/FLOPS_HP.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,26 @@ | ||||
| SHORT Half-Precision MFLOP/s | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC3  FP_HP_FIXED_OPS_SPEC | ||||
| PMC4  FP_HP_SCALE_OPS_SPEC | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Clock [MHz] 1.E-06*PMC1/time | ||||
| CPI  PMC1/PMC0 | ||||
| HP (FP) [MFLOP/s] 1E-06*(PMC3)/time | ||||
| HP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC4*128)/128)+PMC3)/time | ||||
| HP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC4*256)/128)+PMC3)/time | ||||
| HP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC4*512)/128)+PMC3)/time | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| HP (FP) [MFLOP/s] = 1E-06*FP_HP_FIXED_OPS_SPEC/time | ||||
| HP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*128)/128))/time | ||||
| HP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*256)/128))/time | ||||
| HP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*512)/128))/time | ||||
| - | ||||
| Half-precision FP rate for scalar and SVE vector operations with different widths. The | ||||
| SVE metrics assume that all vector elements are active. | ||||
							
								
								
									
										26
									
								
								collectors/likwid/groups/arm64fx/FLOPS_SP.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								collectors/likwid/groups/arm64fx/FLOPS_SP.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,26 @@ | ||||
| SHORT Single Precision MFLOP/s | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC3  FP_SP_FIXED_OPS_SPEC | ||||
| PMC4  FP_SP_SCALE_OPS_SPEC | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Clock [MHz] 1.E-06*PMC1/time | ||||
| CPI  PMC1/PMC0 | ||||
| SP (FP) [MFLOP/s] 1E-06*(PMC3)/time | ||||
| SP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC4*128)/128)+PMC3)/time | ||||
| SP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC4*256)/128)+PMC3)/time | ||||
| SP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC4*512)/128)+PMC3)/time | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| SP (FP) [MFLOP/s] = 1E-06*FP_SP_FIXED_OPS_SPEC/time | ||||
| SP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*128)/128))/time | ||||
| SP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*256)/128))/time | ||||
| SP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*512)/128))/time | ||||
| - | ||||
| Single-precision FP rate for scalar and SVE vector operations with different widths. The events for | ||||
| the SVE metrics assume that all vector elements are active. | ||||
							
								
								
									
										33
									
								
								collectors/likwid/groups/arm64fx/FP_PIPE.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										33
									
								
								collectors/likwid/groups/arm64fx/FP_PIPE.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,33 @@ | ||||
| SHORT Utilization of FP pipelines | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  FLA_VAL | ||||
| PMC3  FLA_VAL_PRD_CNT | ||||
| PMC4  FLB_VAL | ||||
| PMC5  FLB_VAL_PRD_CNT | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| CPI  PMC1/PMC0 | ||||
| FP operation pipeline A busy rate [%] (PMC2/PMC1)*100.0 | ||||
| FP pipeline A active element rate [%] (PMC3/(PMC2*16))*100.0 | ||||
| FP operation pipeline B busy rate [%] (PMC4/PMC1)*100.0 | ||||
| FP pipeline B active element rate [%] (PMC5/(PMC4*16))*100.0 | ||||
|  | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| CPI = CPU_CYCLES/INST_RETIRED | ||||
| FP operation pipeline A busy rate [%] = (FLA_VAL/CPU_CYCLES)*100.0 | ||||
| FP pipeline A active element rate [%] = (FLA_VAL_PRD_CNT/(FLA_VAL*16))*100.0 | ||||
| FP operation pipeline B busy rate [%] = (FLB_VAL/CPU_CYCLES)*100.0 | ||||
| FP pipeline B active element rate [%] = (FLB_VAL_PRD_CNT/(FLB_VAL*16))*100.0 | ||||
| - | ||||
| FLx_VAL: This event counts valid cycles of FLx pipeline. | ||||
| FLx_VAL_PRD_CNT: This event counts the number of 1's in the predicate bits of | ||||
|                  request in FLx pipeline, where it is corrected so that it | ||||
|                  becomes 16 when all bits are 1. | ||||
| Each predicate mask thus has 16 slots, so there are 16 slots per cycle in FLA and | ||||
| FLB. FLA is partly used by other instructions like SVE stores. | ||||
							
								
								
									
										24
									
								
								collectors/likwid/groups/arm64fx/ICACHE.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								collectors/likwid/groups/arm64fx/ICACHE.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,24 @@ | ||||
| SHORT  Instruction cache miss rate/ratio | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  L1I_CACHE | ||||
| PMC3  L1I_CACHE_REFILL | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| CPI  PMC1/PMC0 | ||||
| L1I request rate PMC2/PMC0 | ||||
| L1I miss rate PMC3/PMC0 | ||||
| L1I miss ratio PMC3/PMC2 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| CPI = CPU_CYCLES/INST_RETIRED | ||||
| L1I request rate = L1I_CACHE / INST_RETIRED | ||||
| L1I miss rate = L1I_CACHE_REFILL / INST_RETIRED | ||||
| L1I miss ratio = L1I_CACHE_REFILL / L1I_CACHE | ||||
| - | ||||
| This group measures some L1 instruction cache metrics. | ||||
							
								
								
									
										40
									
								
								collectors/likwid/groups/arm64fx/L2.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										40
									
								
								collectors/likwid/groups/arm64fx/L2.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,40 @@ | ||||
| SHORT  L2 cache bandwidth in MBytes/s | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  L1D_CACHE_REFILL | ||||
| PMC3  L1D_CACHE_WB | ||||
| PMC4  L1I_CACHE_REFILL | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| CPI  PMC1/PMC0 | ||||
| L1D<-L2 load bandwidth [MBytes/s]  1.0E-06*(PMC2)*256.0/time | ||||
| L1D<-L2 load data volume [GBytes]  1.0E-09*(PMC2)*256.0 | ||||
| L1D->L2 evict bandwidth [MBytes/s]  1.0E-06*PMC3*256.0/time | ||||
| L1D->L2 evict data volume [GBytes]  1.0E-09*PMC3*256.0 | ||||
| L1I<-L2 load bandwidth [MBytes/s]  1.0E-06*PMC4*256.0/time | ||||
| L1I<-L2 load data volume [GBytes]  1.0E-09*PMC4*256.0 | ||||
| L1<->L2 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3+PMC4)*256.0/time | ||||
| L1<->L2 data volume [GBytes] 1.0E-09*(PMC2+PMC3+PMC4)*256.0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| CPI = CPU_CYCLES/INST_RETIRED | ||||
| L1D<-L2 load bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_REFILL*256.0/time | ||||
| L1D<-L2 load data volume [GBytes] = 1.0E-09*L1D_CACHE_REFILL*256.0 | ||||
| L1D->L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_WB*256.0/time | ||||
| L1D->L2 evict data volume [GBytes] = 1.0E-09*L1D_CACHE_WB*256.0 | ||||
| L1I<-L2 load bandwidth [MBytes/s] = 1.0E-06*L1I_CACHE_REFILL*256.0/time | ||||
| L1I<-L2 load data volume [GBytes] = 1.0E-09*L1I_CACHE_REFILL*256.0 | ||||
| L1<->L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*256.0/time | ||||
| L1<->L2 data volume [GBytes] = 1.0E-09*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*256.0 | ||||
| - | ||||
| Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the | ||||
| number of cachelines loaded from the L2 to the L1 data cache and the writebacks from | ||||
| the L1 data cache to the L2 cache. The group also outputs total data volume transferred between | ||||
| L2 and L1. Note that this bandwidth also includes data transfers due to a write | ||||
| allocate load on a store miss in L1 and cachelines transferred into the L1 instruction | ||||
| cache. | ||||
							
								
								
									
										29
									
								
								collectors/likwid/groups/arm64fx/MEM.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								collectors/likwid/groups/arm64fx/MEM.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,29 @@ | ||||
| SHORT Main memory bandwidth in MBytes/s | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  BUS_READ_TOTAL_MEM | ||||
| PMC3  BUS_WRITE_TOTAL_MEM | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| CPI  PMC1/PMC0 | ||||
| Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time | ||||
| Memory read data volume [GBytes] 1.0E-09*(PMC2)*256.0 | ||||
| Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time | ||||
| Memory write data volume [GBytes] 1.0E-09*(PMC3)*256.0 | ||||
| Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time | ||||
| Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Memory read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM)*256.0/runtime | ||||
| Memory read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM)*256.0 | ||||
| Memory write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_MEM)*256.0/runtime | ||||
| Memory write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_MEM)*256.0 | ||||
| Memory bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0/runtime | ||||
| Memory data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0 | ||||
| - | ||||
| Profiling group to measure memory bandwidth. The cache line size is 256 Byte. | ||||
							
								
								
									
										50
									
								
								collectors/likwid/groups/arm64fx/MEM_DP.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										50
									
								
								collectors/likwid/groups/arm64fx/MEM_DP.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,50 @@ | ||||
| SHORT Overview of arithmetic and main memory performance | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  BUS_READ_TOTAL_MEM | ||||
| PMC3  BUS_WRITE_TOTAL_MEM | ||||
| PMC4  FP_DP_FIXED_OPS_SPEC | ||||
| PMC5  FP_DP_SCALE_OPS_SPEC | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| CPI  PMC1/PMC0 | ||||
| DP (FP) [MFLOP/s] 1E-06*(PMC4)/time | ||||
| DP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC5*128.0)/128.0)+PMC4)/time | ||||
| DP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC5*256.0)/128.0)+PMC4)/time | ||||
| DP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC5*512.0)/128.0)+PMC4)/time | ||||
| Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time | ||||
| Memory read data volume [GBytes] 1.0E-09*(PMC2)*256.0 | ||||
| Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time | ||||
| Memory write data volume [GBytes] 1.0E-09*(PMC3)*256.0 | ||||
| Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time | ||||
| Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0 | ||||
| Operational intensity (FP) PMC4/((PMC2+PMC3)*256.0) | ||||
| Operational intensity (FP+SVE128) (((PMC5*128.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) | ||||
| Operational intensity (FP+SVE256) (((PMC5*256.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) | ||||
| Operational intensity (FP+SVE512) (((PMC5*512.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) | ||||
|  | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| DP (FP) [MFLOP/s] = 1E-06*FP_DP_FIXED_OPS_SPEC/time | ||||
| DP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*128)/128))/time | ||||
| DP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*256)/128))/time | ||||
| DP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*512)/128))/time | ||||
| Memory read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM)*256.0/runtime | ||||
| Memory read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM)*256.0 | ||||
| Memory write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_MEM)*256.0/runtime | ||||
| Memory write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_MEM)*256.0 | ||||
| Memory bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0/runtime | ||||
| Memory data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0 | ||||
| Operational intensity (FP) = FP_DP_FIXED_OPS_SPEC/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) | ||||
| Operational intensity (FP+SVE128) = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*128)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) | ||||
| Operational intensity (FP+SVE256) = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*256)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) | ||||
| Operational intensity (FP+SVE512) = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*512)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) | ||||
| - | ||||
| Profiling group to measure memory bandwidth and double-precision FP rate for scalar and SVE vector | ||||
| operations with different widths. The events for the SVE metrics assume that all vector elements | ||||
| are active. The cache line size is 256 Byte. | ||||
							
								
								
									
										50
									
								
								collectors/likwid/groups/arm64fx/MEM_HP.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										50
									
								
								collectors/likwid/groups/arm64fx/MEM_HP.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,50 @@ | ||||
| SHORT Overview of arithmetic and main memory performance | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  BUS_READ_TOTAL_MEM | ||||
| PMC3  BUS_WRITE_TOTAL_MEM | ||||
| PMC4  FP_HP_FIXED_OPS_SPEC | ||||
| PMC5  FP_HP_SCALE_OPS_SPEC | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| CPI  PMC1/PMC0 | ||||
| HP (FP) [MFLOP/s] 1E-06*(PMC4)/time | ||||
| HP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC5*128.0)/128.0)+PMC4)/time | ||||
| HP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC5*256.0)/128.0)+PMC4)/time | ||||
| HP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC5*512.0)/128.0)+PMC4)/time | ||||
| Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time | ||||
| Memory read data volume [GBytes] 1.0E-09*(PMC2)*256.0 | ||||
| Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time | ||||
| Memory write data volume [GBytes] 1.0E-09*(PMC3)*256.0 | ||||
| Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time | ||||
| Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0 | ||||
| Operational intensity (FP) PMC4/((PMC2+PMC3)*256.0) | ||||
| Operational intensity (FP+SVE128) (((PMC5*128.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) | ||||
| Operational intensity (FP+SVE256) (((PMC5*256.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) | ||||
| Operational intensity (FP+SVE512) (((PMC5*512.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) | ||||
|  | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| HP (FP) [MFLOP/s] = 1E-06*FP_HP_FIXED_OPS_SPEC/time | ||||
| HP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*128)/128))/time | ||||
| HP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*256)/128))/time | ||||
| HP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*512)/128))/time | ||||
| Memory read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM)*256.0/runtime | ||||
| Memory read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM)*256.0 | ||||
| Memory write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_MEM)*256.0/runtime | ||||
| Memory write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_MEM)*256.0 | ||||
| Memory bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0/runtime | ||||
| Memory data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0 | ||||
| Operational intensity (FP) = FP_HP_FIXED_OPS_SPEC/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) | ||||
| Operational intensity (FP+SVE128) = (FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*128)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) | ||||
| Operational intensity (FP+SVE256) = (FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*256)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) | ||||
| Operational intensity (FP+SVE512) = (FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*512)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) | ||||
| - | ||||
| Profiling group to measure memory bandwidth and half-precision FP rate for scalar and SVE vector | ||||
| operations with different widths. The events for the SVE metrics assume that all vector elements | ||||
| are active. The cache line size is 256 Byte. | ||||
							
								
								
									
										50
									
								
								collectors/likwid/groups/arm64fx/MEM_SP.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										50
									
								
								collectors/likwid/groups/arm64fx/MEM_SP.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,50 @@ | ||||
| SHORT Overview of arithmetic and main memory performance | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  BUS_READ_TOTAL_MEM | ||||
| PMC3  BUS_WRITE_TOTAL_MEM | ||||
| PMC4  FP_SP_FIXED_OPS_SPEC | ||||
| PMC5  FP_SP_SCALE_OPS_SPEC | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| CPI  PMC1/PMC0 | ||||
| SP (FP) [MFLOP/s] 1E-06*(PMC4)/time | ||||
| SP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC5*128.0)/128.0)+PMC4)/time | ||||
| SP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC5*256.0)/128.0)+PMC4)/time | ||||
| SP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC5*512.0)/128.0)+PMC4)/time | ||||
| Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time | ||||
| Memory read data volume [GBytes] 1.0E-09*(PMC2)*256.0 | ||||
| Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time | ||||
| Memory write data volume [GBytes] 1.0E-09*(PMC3)*256.0 | ||||
| Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time | ||||
| Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0 | ||||
| Operational intensity (FP) PMC4/((PMC2+PMC3)*256.0) | ||||
| Operational intensity (FP+SVE128) (((PMC5*128.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) | ||||
| Operational intensity (FP+SVE256) (((PMC5*256.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) | ||||
| Operational intensity (FP+SVE512) (((PMC5*512.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) | ||||
|  | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| SP (FP) [MFLOP/s] = 1E-06*FP_SP_FIXED_OPS_SPEC/time | ||||
| SP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*128)/128))/time | ||||
| SP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*256)/128))/time | ||||
| SP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*512)/128))/time | ||||
| Memory read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM)*256.0/runtime | ||||
| Memory read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM)*256.0 | ||||
| Memory write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_MEM)*256.0/runtime | ||||
| Memory write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_MEM)*256.0 | ||||
| Memory bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0/runtime | ||||
| Memory data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0 | ||||
| Operational intensity (FP) = FP_SP_FIXED_OPS_SPEC/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) | ||||
| Operational intensity (FP+SVE128) = (FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*128)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) | ||||
| Operational intensity (FP+SVE256) = (FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*256)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) | ||||
| Operational intensity (FP+SVE512) = (FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*512)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) | ||||
| - | ||||
| Profiling group to measure memory bandwidth and single-precision FP rate for scalar and SVE vector | ||||
| operations with different widths. The events for the SVE metrics assume that all vector elements | ||||
| are active. The cache line size is 256 Byte. | ||||
							
								
								
									
										29
									
								
								collectors/likwid/groups/arm64fx/PCI.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								collectors/likwid/groups/arm64fx/PCI.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,29 @@ | ||||
| SHORT PCI bandwidth in MBytes/s | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  BUS_READ_TOTAL_PCI | ||||
| PMC3  BUS_WRITE_TOTAL_PCI | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| CPI  PMC1/PMC0 | ||||
| PCI read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time | ||||
| PCI read data volume [GBytes] 1.0E-09*(PMC2)*256.0 | ||||
| PCI write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time | ||||
| PCI write data volume [GBytes] 1.0E-09*(PMC3)*256.0 | ||||
| PCI bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time | ||||
| PCI data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| PCI read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_PCI)*256.0/runtime | ||||
| PCI read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_PCI)*256.0 | ||||
| PCI write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_PCI)*256.0/runtime | ||||
| PCI write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_PCI)*256.0 | ||||
| PCI bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_PCI+BUS_WRITE_TOTAL_PCI)*256.0/runtime | ||||
| PCI data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_PCI+BUS_WRITE_TOTAL_PCI)*256.0 | ||||
| - | ||||
| Profiling group to measure PCI bandwidth. The cache line size is 256 Byte. | ||||
							
								
								
									
										29
									
								
								collectors/likwid/groups/arm64fx/TOFU.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								collectors/likwid/groups/arm64fx/TOFU.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,29 @@ | ||||
| SHORT TOFU bandwidth in MBytes/s | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  BUS_READ_TOTAL_TOFU | ||||
| PMC3  BUS_WRITE_TOTAL_TOFU | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| CPI  PMC1/PMC0 | ||||
| TOFU read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time | ||||
| TOFU read data volume [GBytes] 1.0E-09*(PMC2)*256.0 | ||||
| TOFU write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time | ||||
| TOFU write data volume [GBytes] 1.0E-09*(PMC3)*256.0 | ||||
| TOFU bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time | ||||
| TOFU data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| TOFU read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_TOFU)*256.0/runtime | ||||
| TOFU read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_TOFU)*256.0 | ||||
| TOFU write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_TOFU)*256.0/runtime | ||||
| TOFU write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_TOFU)*256.0 | ||||
| TOFU bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_TOFU+BUS_WRITE_TOTAL_TOFU)*256.0/runtime | ||||
| TOFU data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_TOFU+BUS_WRITE_TOTAL_TOFU)*256.0 | ||||
| - | ||||
| Profiling group to measure TOFU bandwidth. The cache line size is 256 Byte. | ||||
							
								
								
									
										31
									
								
								collectors/likwid/groups/arm8/BRANCH.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										31
									
								
								collectors/likwid/groups/arm8/BRANCH.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,31 @@ | ||||
| SHORT Branch prediction miss rate/ratio | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  BR_PRED | ||||
| PMC3  BR_MIS_PRED | ||||
| PMC4  INST_SPEC | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| CPI  PMC1/PMC0 | ||||
| Branch rate   PMC2/PMC0 | ||||
| Branch misprediction rate  PMC3/PMC0 | ||||
| Branch misprediction ratio  PMC3/(PMC2+PMC3) | ||||
| Instructions per branch  PMC0/(PMC2+PMC3) | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| CPI = CPU_CYCLES/INST_RETIRED | ||||
| Branch rate = BR_PRED/INST_RETIRED | ||||
| Branch misprediction rate =  BR_MIS_PRED/INST_RETIRED | ||||
| Branch misprediction ratio = BR_MIS_PRED/(BR_PRED+BR_MIS_PRED) | ||||
| Instructions per branch = INST_RETIRED/(BR_PRED+BR_MIS_PRED) | ||||
| - | ||||
| The rates state how often on average a branch or a mispredicted branch occurred | ||||
| per instruction retired in total. The Branch misprediction ratio sets directly | ||||
| into relation what ratio of all branch instructions were mispredicted. | ||||
| Instructions per branch is 1/Branch rate. | ||||
|  | ||||
							
								
								
									
										24
									
								
								collectors/likwid/groups/arm8/DATA.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								collectors/likwid/groups/arm8/DATA.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,24 @@ | ||||
| SHORT Load to store ratio | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  LD_RETIRED | ||||
| PMC3  ST_RETIRED | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| CPI  PMC1/PMC0 | ||||
| Load to store ratio PMC2/PMC3 | ||||
| Load ratio PMC2/PMC0 | ||||
| Store ratio PMC3/PMC0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| CPI = CPU_CYCLES/INST_RETIRED | ||||
| Load to store ratio = LD_RETIRED / ST_RETIRED | ||||
| Load ratio = LD_RETIRED / INST_RETIRED | ||||
| Store ratio = ST_RETIRED / INST_RETIRED | ||||
| - | ||||
| This is a metric to determine your load to store ratio. | ||||
|  | ||||
							
								
								
									
										24
									
								
								collectors/likwid/groups/arm8/ICACHE.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								collectors/likwid/groups/arm8/ICACHE.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,24 @@ | ||||
| SHORT  Instruction cache miss rate/ratio | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  L1I_CACHE | ||||
| PMC3  L1I_CACHE_REFILL | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| CPI  PMC1/PMC0 | ||||
| L1I request rate PMC2/PMC0 | ||||
| L1I miss rate PMC3/PMC0 | ||||
| L1I miss ratio PMC3/PMC2 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| CPI = CPU_CYCLES/INST_RETIRED | ||||
| L1I request rate = L1I_CACHE / INST_RETIRED | ||||
| L1I miss rate = L1I_CACHE_REFILL / INST_RETIRED | ||||
| L1I miss ratio = L1I_CACHE_REFILL / L1I_CACHE | ||||
| - | ||||
| This group measures some L1 instruction cache metrics. | ||||
							
								
								
									
										40
									
								
								collectors/likwid/groups/arm8/L2.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										40
									
								
								collectors/likwid/groups/arm8/L2.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,40 @@ | ||||
| SHORT  L2 cache bandwidth in MBytes/s | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  L1D_CACHE_REFILL | ||||
| PMC3  L1D_CACHE_WB | ||||
| PMC4  L1I_CACHE_REFILL | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| CPI  PMC1/PMC0 | ||||
| L2D load bandwidth [MBytes/s]  1.0E-06*PMC2*64.0/time | ||||
| L2D load data volume [GBytes]  1.0E-09*PMC2*64.0 | ||||
| L2D evict bandwidth [MBytes/s]  1.0E-06*PMC3*64.0/time | ||||
| L2D evict data volume [GBytes]  1.0E-09*PMC3*64.0 | ||||
| L2I load bandwidth [MBytes/s]  1.0E-06*PMC4*64.0/time | ||||
| L2I load data volume [GBytes]  1.0E-09*PMC4*64.0 | ||||
| L2 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3+PMC4)*64.0/time | ||||
| L2 data volume [GBytes] 1.0E-09*(PMC2+PMC3+PMC4)*64.0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| CPI = CPU_CYCLES/INST_RETIRED | ||||
| L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_REFILL*64.0/time | ||||
| L2D load data volume [GBytes] = 1.0E-09*L1D_CACHE_REFILL*64.0 | ||||
| L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_WB*64.0/time | ||||
| L2D evict data volume [GBytes] = 1.0E-09*L1D_CACHE_WB*64.0 | ||||
| L2I load bandwidth [MBytes/s] = 1.0E-06*L1I_CACHE_REFILL*64.0/time | ||||
| L2I load data volume [GBytes] = 1.0E-09*L1I_CACHE_REFILL*64.0 | ||||
| L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0/time | ||||
| L2 data volume [GBytes] = 1.0E-09*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0 | ||||
| - | ||||
| Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the | ||||
| number of cachelines loaded from the L2 to the L1 data cache and the writebacks from | ||||
| the L1 data cache to the L2 cache. The group also outputs total data volume transferred between | ||||
| L2 and L1. Note that this bandwidth also includes data transfers due to a write | ||||
| allocate load on a store miss in L1 and cachelines transferred into the instruction | ||||
| cache. | ||||
							
								
								
									
										30
									
								
								collectors/likwid/groups/arm8/MEM.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										30
									
								
								collectors/likwid/groups/arm8/MEM.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,30 @@ | ||||
| SHORT Main memory bandwidth in MBytes/s | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  L2D_CACHE_REFILL | ||||
| PMC3  L2D_CACHE_WB | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| CPI  PMC1/PMC0 | ||||
| Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*64.0/time | ||||
| Memory read data volume [GBytes] 1.0E-09*(PMC2)*64.0 | ||||
| Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*64.0/time | ||||
| Memory write data volume [GBytes] 1.0E-09*(PMC3)*64.0 | ||||
| Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time | ||||
| Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Memory read bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL)*64.0/runtime | ||||
| Memory read data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL)*64.0 | ||||
| Memory write bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_WB)*64.0/runtime | ||||
| Memory write data volume [GBytes] = 1.0E-09*(L2D_CACHE_WB)*64.0 | ||||
| Memory bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL+L2D_CACHE_WB)*64.0/runtime | ||||
| Memory data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL+L2D_CACHE_WB)*64.0 | ||||
| - | ||||
| Profiling group to measure memory bandwidth as initiated by the L2 cache. | ||||
|  | ||||
							
								
								
									
										31
									
								
								collectors/likwid/groups/arm8_n1/BRANCH.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										31
									
								
								collectors/likwid/groups/arm8_n1/BRANCH.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,31 @@ | ||||
| SHORT Branch prediction miss rate/ratio | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  BR_PRED | ||||
| PMC3  BR_MIS_PRED | ||||
| PMC4  INST_SPEC | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| CPI  PMC1/PMC0 | ||||
| Branch rate   PMC2/PMC0 | ||||
| Branch misprediction rate  PMC3/PMC0 | ||||
| Branch misprediction ratio  PMC3/(PMC2+PMC3) | ||||
| Instructions per branch  PMC0/(PMC2+PMC3) | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| CPI = CPU_CYCLES/INST_RETIRED | ||||
| Branch rate = BR_PRED/INST_RETIRED | ||||
| Branch misprediction rate =  BR_MIS_PRED/INST_RETIRED | ||||
| Branch misprediction ratio = BR_MIS_PRED/(BR_PRED+BR_MIS_PRED) | ||||
| Instructions per branch = INST_RETIRED/(BR_PRED+BR_MIS_PRED) | ||||
| - | ||||
| The rates state how often on average a branch or a mispredicted branch occurred | ||||
| per instruction retired in total. The Branch misprediction ratio sets directly | ||||
| into relation what ratio of all branch instructions were mispredicted. | ||||
| Instructions per branch is 1/Branch rate. | ||||
|  | ||||
							
								
								
									
										16
									
								
								collectors/likwid/groups/arm8_n1/CLOCK.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								collectors/likwid/groups/arm8_n1/CLOCK.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,16 @@ | ||||
| SHORT Cycles and instructions | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| CPI  PMC1/PMC0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| CPI = CPU_CYCLES/INST_RETIRED | ||||
| - | ||||
| This is a metric to determine cycles per instruction. | ||||
|  | ||||
							
								
								
									
										24
									
								
								collectors/likwid/groups/arm8_n1/DATA.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								collectors/likwid/groups/arm8_n1/DATA.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,24 @@ | ||||
| SHORT Load to store ratio | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  LD_SPEC | ||||
| PMC3  ST_SPEC | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| CPI  PMC1/PMC0 | ||||
| Load to store ratio PMC2/PMC3 | ||||
| Load ratio PMC2/PMC0 | ||||
| Store ratio PMC3/PMC0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| CPI = CPU_CYCLES/INST_RETIRED | ||||
| Load to store ratio = LD_SPEC / ST_SPEC | ||||
| Load ratio = LD_SPEC / INST_RETIRED | ||||
| Store ratio = ST_SPEC / INST_RETIRED | ||||
| - | ||||
| This is a metric to determine your load to store ratio. | ||||
|  | ||||
							
								
								
									
										24
									
								
								collectors/likwid/groups/arm8_n1/ICACHE.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								collectors/likwid/groups/arm8_n1/ICACHE.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,24 @@ | ||||
| SHORT  Instruction cache miss rate/ratio | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  L1I_CACHE | ||||
| PMC3  L1I_CACHE_REFILL | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| CPI  PMC1/PMC0 | ||||
| L1I request rate PMC2/PMC0 | ||||
| L1I miss rate PMC3/PMC0 | ||||
| L1I miss ratio PMC3/PMC2 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| CPI = CPU_CYCLES/INST_RETIRED | ||||
| L1I request rate = L1I_CACHE / INST_RETIRED | ||||
| L1I miss rate = L1I_CACHE_REFILL / INST_RETIRED | ||||
| L1I miss ratio = L1I_CACHE_REFILL / L1I_CACHE | ||||
| - | ||||
| This group measures some L1 instruction cache metrics. | ||||
							
								
								
									
										40
									
								
								collectors/likwid/groups/arm8_n1/L2.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										40
									
								
								collectors/likwid/groups/arm8_n1/L2.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,40 @@ | ||||
| SHORT  L2 cache bandwidth in MBytes/s | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  L1D_CACHE_REFILL | ||||
| PMC3  L1D_CACHE_WB | ||||
| PMC4  L1I_CACHE_REFILL | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| CPI  PMC1/PMC0 | ||||
| L2D load bandwidth [MBytes/s]  1.0E-06*PMC2*64.0/time | ||||
| L2D load data volume [GBytes]  1.0E-09*PMC2*64.0 | ||||
| L2D evict bandwidth [MBytes/s]  1.0E-06*PMC3*64.0/time | ||||
| L2D evict data volume [GBytes]  1.0E-09*PMC3*64.0 | ||||
| L2I load bandwidth [MBytes/s]  1.0E-06*PMC4*64.0/time | ||||
| L2I load data volume [GBytes]  1.0E-09*PMC4*64.0 | ||||
| L2 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3+PMC4)*64.0/time | ||||
| L2 data volume [GBytes] 1.0E-09*(PMC2+PMC3+PMC4)*64.0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| CPI = CPU_CYCLES/INST_RETIRED | ||||
| L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_REFILL*64.0/time | ||||
| L2D load data volume [GBytes] = 1.0E-09*L1D_CACHE_REFILL*64.0 | ||||
| L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_WB*64.0/time | ||||
| L2D evict data volume [GBytes] = 1.0E-09*L1D_CACHE_WB*64.0 | ||||
| L2I load bandwidth [MBytes/s] = 1.0E-06*L1I_CACHE_REFILL*64.0/time | ||||
| L2I load data volume [GBytes] = 1.0E-09*L1I_CACHE_REFILL*64.0 | ||||
| L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0/time | ||||
| L2 data volume [GBytes] = 1.0E-09*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0 | ||||
| - | ||||
| Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the | ||||
| number of cache lines loaded from the L2 to the L1 data cache and the writebacks from | ||||
| the L1 data cache to the L2 cache. The group also outputs total data volume transferred between | ||||
| L2 and L1. Note that this bandwidth also includes data transfers due to a write | ||||
| allocate load on a store miss in L1 and cache lines transferred to the instruction | ||||
| cache. | ||||
							
								
								
									
										30
									
								
								collectors/likwid/groups/arm8_n1/L3.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										30
									
								
								collectors/likwid/groups/arm8_n1/L3.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,30 @@ | ||||
| SHORT L3 cache bandwidth in MBytes/s  | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  L2D_CACHE_REFILL | ||||
| PMC3  L2D_CACHE_WB | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| CPI  PMC1/PMC0 | ||||
| L3 read bandwidth [MBytes/s] 1.0E-06*(PMC2)*64.0/time | ||||
| L3 read data volume [GBytes] 1.0E-09*(PMC2)*64.0 | ||||
| L3 write bandwidth [MBytes/s] 1.0E-06*(PMC3)*64.0/time | ||||
| L3 write data volume [GBytes] 1.0E-09*(PMC3)*64.0 | ||||
| L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time | ||||
| L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| L3 read bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL)*64.0/runtime | ||||
| L3 read data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL)*64.0 | ||||
| L3 write bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_WB)*64.0/runtime | ||||
| L3 write data volume [GBytes] = 1.0E-09*(L2D_CACHE_WB)*64.0 | ||||
| L3 bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL+L2D_CACHE_WB)*64.0/runtime | ||||
| L3 data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL+L2D_CACHE_WB)*64.0 | ||||
| - | ||||
| Profiling group to measure traffic between L2 and L3 cache. | ||||
|  | ||||
							
								
								
									
										29
									
								
								collectors/likwid/groups/arm8_n1/MEM.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								collectors/likwid/groups/arm8_n1/MEM.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,29 @@ | ||||
| SHORT Main memory bandwidth in MBytes/s | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  MEM_ACCESS_RD | ||||
| PMC3  MEM_ACCESS_WR | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| CPI  PMC1/PMC0 | ||||
| Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*64.0/time | ||||
| Memory read data volume [GBytes] 1.0E-09*(PMC2)*64.0 | ||||
| Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*64.0/time | ||||
| Memory write data volume [GBytes] 1.0E-09*(PMC3)*64.0 | ||||
| Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time | ||||
| Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Memory read bandwidth [MBytes/s] = 1.0E-06*(MEM_ACCESS_RD)*64.0/runtime | ||||
| Memory read data volume [GBytes] = 1.0E-09*(MEM_ACCESS_RD)*64.0 | ||||
| Memory write bandwidth [MBytes/s] = 1.0E-06*(MEM_ACCESS_WR)*64.0/runtime | ||||
| Memory write data volume [GBytes] = 1.0E-09*(MEM_ACCESS_WR)*64.0 | ||||
| Memory bandwidth [MBytes/s] = 1.0E-06*(MEM_ACCESS_RD+MEM_ACCESS_WR)*64.0/runtime | ||||
| Memory data volume [GBytes] = 1.0E-09*(MEM_ACCESS_RD+MEM_ACCESS_WR)*64.0 | ||||
| - | ||||
| Profiling group to measure memory bandwidth | ||||
|  | ||||
							
								
								
									
										30
									
								
								collectors/likwid/groups/arm8_n1/TLB.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										30
									
								
								collectors/likwid/groups/arm8_n1/TLB.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,30 @@ | ||||
| SHORT L1/L2 TLB information  | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  L1D_TLB | ||||
| PMC1  L1I_TLB | ||||
| PMC2  L2D_TLB | ||||
| PMC3  L1D_TLB_REFILL | ||||
| PMC4  L1I_TLB_REFILL | ||||
| PMC5  L2D_TLB_REFILL | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| L1 DTLB accesses PMC0 | ||||
| L1 ITLB accesses PMC1 | ||||
| L2 DTLB accesses PMC2 | ||||
| L1 DTLB refills PMC3 | ||||
| L1 ITLB refills PMC4 | ||||
| L2 DTLB refills PMC5 | ||||
| L1 DTLB refill ratio PMC3/PMC0 | ||||
| L1 ITLB refill ratio PMC4/PMC1 | ||||
| L2 DTLB refill ratio PMC5/PMC2 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| L1 DTLB refill ratio = L1D_TLB_REFILL / L1D_TLB | ||||
| L1 ITLB refill ratio = L1I_TLB_REFILL / L1I_TLB | ||||
| L2 DTLB refill ratio = L2D_TLB_REFILL / L2D_TLB | ||||
| - | ||||
| This group gives information about the TLB usage for all TLBs: | ||||
| L1 data, L1 instruction and L2 data. | ||||
							
								
								
									
										32
									
								
								collectors/likwid/groups/arm8_tx2/BRANCH.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										32
									
								
								collectors/likwid/groups/arm8_tx2/BRANCH.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,32 @@ | ||||
| SHORT Branch prediction miss rate/ratio | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  BR_PRED | ||||
| PMC3  BR_MIS_PRED | ||||
| PMC4  INST_SPEC | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Clock [MHz] 1.E-06*PMC1/time | ||||
| CPI  PMC1/PMC0 | ||||
| Branch rate   PMC2/PMC0 | ||||
| Branch misprediction rate  PMC3/PMC0 | ||||
| Branch misprediction ratio  PMC3/(PMC2+PMC3) | ||||
| Instructions per branch  PMC0/(PMC2+PMC3) | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| CPI = CPU_CYCLES/INST_RETIRED | ||||
| Branch rate = BR_PRED/INST_RETIRED | ||||
| Branch misprediction rate =  BR_MIS_PRED/INST_RETIRED | ||||
| Branch misprediction ratio = BR_MIS_PRED/(BR_PRED+BR_MIS_PRED) | ||||
| Instructions per branch = INST_RETIRED/(BR_PRED+BR_MIS_PRED) | ||||
| - | ||||
| The rates state how often on average a branch or a mispredicted branch occurred | ||||
| per instruction retired in total. The Branch misprediction ratio sets directly | ||||
| into relation what ratio of all branch instructions were mispredicted. | ||||
| Instructions per branch is 1/Branch rate. | ||||
|  | ||||
							
								
								
									
										25
									
								
								collectors/likwid/groups/arm8_tx2/DATA.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								collectors/likwid/groups/arm8_tx2/DATA.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,25 @@ | ||||
| SHORT Load to store ratio | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  LD_RETIRED | ||||
| PMC3  ST_RETIRED | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Clock [MHz] 1.E-06*PMC1/time | ||||
| CPI  PMC1/PMC0 | ||||
| Load to store ratio PMC2/PMC3 | ||||
| Load ratio PMC2/PMC0 | ||||
| Store ratio PMC3/PMC0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| CPI = CPU_CYCLES/INST_RETIRED | ||||
| Load to store ratio = LD_RETIRED / ST_RETIRED | ||||
| Load ratio = LD_RETIRED / INST_RETIRED | ||||
| Store ratio = ST_RETIRED / INST_RETIRED | ||||
| - | ||||
| This is a metric to determine your load to store ratio. | ||||
|  | ||||
							
								
								
									
										28
									
								
								collectors/likwid/groups/arm8_tx2/FLOPS_DP.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										28
									
								
								collectors/likwid/groups/arm8_tx2/FLOPS_DP.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,28 @@ | ||||
| SHORT Double Precision MFLOP/s | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  VFP_SPEC | ||||
| PMC3  ASE_SPEC | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Clock [MHz] 1.E-06*PMC1/time | ||||
| CPI  PMC1/PMC0 | ||||
| DP [MFLOP/s]  1.0E-06*(PMC3*2.0+PMC2)/time | ||||
| NEON DP [MFLOP/s]  1.0E-06*(PMC3*2.0)/time | ||||
| Packed [MUOPS/s]   1.0E-06*(PMC3)/time | ||||
| Scalar [MUOPS/s] 1.0E-06*PMC2/time | ||||
| Vectorization ratio 100*(PMC3)/(PMC2+PMC3) | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| DP [MFLOP/s] = 1.0E-06*(ASE_SPEC*2+VFP_SPEC)/runtime | ||||
| NEON DP [MFLOP/s] = 1.0E-06*(ASE_SPEC*2)/runtime | ||||
| Packed [MUOPS/s] = 1.0E-06*(ASE_SPEC)/runtime | ||||
| Scalar [MUOPS/s] = 1.0E-06*VFP_SPEC/runtime | ||||
| Vectorization ratio = 100*(ASE_SPEC)/(ASE_SPEC+VFP_SPEC) | ||||
| - | ||||
| NEON scalar and packed double precision FLOP rates. | ||||
|  | ||||
							
								
								
									
										28
									
								
								collectors/likwid/groups/arm8_tx2/FLOPS_SP.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										28
									
								
								collectors/likwid/groups/arm8_tx2/FLOPS_SP.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,28 @@ | ||||
| SHORT Single Precision MFLOP/s | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  VFP_SPEC | ||||
| PMC3  ASE_SPEC | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Clock [MHz] 1.E-06*PMC1/time | ||||
| CPI  PMC1/PMC0 | ||||
| SP [MFLOP/s]  1.0E-06*(PMC3*2.0+PMC2)/time | ||||
| NEON SP [MFLOP/s]  1.0E-06*(PMC3*2.0)/time | ||||
| Packed [MUOPS/s]   1.0E-06*(PMC3)/time | ||||
| Scalar [MUOPS/s] 1.0E-06*PMC2/time | ||||
| Vectorization ratio 100*(PMC3)/(PMC2+PMC3) | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| SP [MFLOP/s] = 1.0E-06*(ASE_SPEC*2+VFP_SPEC)/runtime | ||||
| NEON SP [MFLOP/s] = 1.0E-06*(ASE_SPEC*2)/runtime | ||||
| Packed [MUOPS/s] = 1.0E-06*(ASE_SPEC)/runtime | ||||
| Scalar [MUOPS/s] = 1.0E-06*VFP_SPEC/runtime | ||||
| Vectorization ratio = 100*(ASE_SPEC)/(ASE_SPEC+VFP_SPEC) | ||||
| - | ||||
| NEON scalar and packed single precision FLOP rates. | ||||
|  | ||||
							
								
								
									
										23
									
								
								collectors/likwid/groups/arm8_tx2/ICACHE.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								collectors/likwid/groups/arm8_tx2/ICACHE.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,23 @@ | ||||
| SHORT  Instruction cache miss rate/ratio | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  L1I_CACHE | ||||
| PMC3  L1I_CACHE_REFILL | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Clock [MHz] 1.E-06*PMC1/time | ||||
| CPI  PMC1/PMC0 | ||||
| L1I request rate PMC2/PMC0 | ||||
| L1I miss rate PMC3/PMC0 | ||||
| L1I miss ratio PMC3/PMC2 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| L1I request rate = L1I_CACHE / INST_RETIRED | ||||
| L1I miss rate = L1I_CACHE_REFILL / INST_RETIRED | ||||
| L1I miss ratio = L1I_CACHE_REFILL / L1I_CACHE | ||||
| - | ||||
| This group measures some L1 instruction cache metrics. | ||||
							
								
								
									
										41
									
								
								collectors/likwid/groups/arm8_tx2/L2.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										41
									
								
								collectors/likwid/groups/arm8_tx2/L2.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,41 @@ | ||||
| SHORT  L2 cache bandwidth in MBytes/s | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  L1D_CACHE_REFILL | ||||
| PMC3  L1D_CACHE_WB | ||||
| PMC4  L1I_CACHE_REFILL | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Clock [MHz] 1.E-06*PMC1/time | ||||
| CPI  PMC1/PMC0 | ||||
| L2D load bandwidth [MBytes/s]  1.0E-06*PMC2*64.0/time | ||||
| L2D load data volume [GBytes]  1.0E-09*PMC2*64.0 | ||||
| L2D evict bandwidth [MBytes/s]  1.0E-06*PMC3*64.0/time | ||||
| L2D evict data volume [GBytes]  1.0E-09*PMC3*64.0 | ||||
| L2I load bandwidth [MBytes/s]  1.0E-06*PMC4*64.0/time | ||||
| L2I load data volume [GBytes]  1.0E-09*PMC4*64.0 | ||||
| L2 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3+PMC4)*64.0/time | ||||
| L2 data volume [GBytes] 1.0E-09*(PMC2+PMC3+PMC4)*64.0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| CPI = CPU_CYCLES/INST_RETIRED | ||||
| L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_REFILL*64.0/time | ||||
| L2D load data volume [GBytes] = 1.0E-09*L1D_CACHE_REFILL*64.0 | ||||
| L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_WB*64.0/time | ||||
| L2D evict data volume [GBytes] = 1.0E-09*L1D_CACHE_WB*64.0 | ||||
| L2I load bandwidth [MBytes/s] = 1.0E-06*L1I_CACHE_REFILL*64.0/time | ||||
| L2I load data volume [GBytes] = 1.0E-09*L1I_CACHE_REFILL*64.0 | ||||
| L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0/time | ||||
| L2 data volume [GBytes] = 1.0E-09*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0 | ||||
| - | ||||
| Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the | ||||
| number of cache lines loaded from the L2 to the L1 data cache and the writebacks from | ||||
| the L1 data cache to the L2 cache. The group also outputs total data volume transferred between | ||||
| L2 and L1. Note that this bandwidth also includes data transfers due to a write | ||||
| allocate load on a store miss in L1 and cache lines transferred to the instruction | ||||
| cache. | ||||
							
								
								
									
										32
									
								
								collectors/likwid/groups/arm8_tx2/L2CACHE.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										32
									
								
								collectors/likwid/groups/arm8_tx2/L2CACHE.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,32 @@ | ||||
| SHORT L2 cache miss rate/ratio | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  L2D_CACHE | ||||
| PMC3  L2D_CACHE_REFILL | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Clock [MHz] 1.E-06*PMC1/time | ||||
| CPI  PMC1/PMC0 | ||||
| L2 request rate PMC2/PMC0 | ||||
| L2 miss rate PMC3/PMC0 | ||||
| L2 miss ratio PMC3/PMC2 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| L2 request rate = L2D_CACHE/INST_RETIRED | ||||
| L2 miss rate = L2D_CACHE_REFILL/INST_RETIRED | ||||
| L2 miss ratio = L2D_CACHE_REFILL/L2D_CACHE | ||||
| - | ||||
| This group measures the locality of your data accesses with regard to the | ||||
| L2 cache. L2 request rate tells you how data intensive your code is | ||||
| or how many data accesses you have on average per instruction. | ||||
| The L2 miss rate gives a measure of how often it was necessary to get | ||||
| cache lines from memory. And finally L2 miss ratio tells you how many of your | ||||
| memory references required a cache line to be loaded from a higher level. | ||||
| While the data cache miss rate might be given by your algorithm you should | ||||
| try to get data cache miss ratio as low as possible by increasing your cache reuse. | ||||
|  | ||||
|  | ||||
							
								
								
									
										38
									
								
								collectors/likwid/groups/arm8_tx2/L3.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										38
									
								
								collectors/likwid/groups/arm8_tx2/L3.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,38 @@ | ||||
| SHORT  L3 cache bandwidth in MBytes/s | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  L2D_CACHE_REFILL | ||||
| PMC3  L2D_CACHE_WB | ||||
| PMC4  L2D_CACHE_ALLOCATE | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Clock [MHz] 1.E-06*PMC1/time | ||||
| CPI  PMC1/PMC0 | ||||
| L3 load bandwidth [MBytes/s]  1.0E-06*(PMC2-PMC4)*64.0/time | ||||
| L3 load data volume [GBytes]  1.0E-09*(PMC2-PMC4)*64.0 | ||||
| L3 evict bandwidth [MBytes/s]  1.0E-06*PMC3*64.0/time | ||||
| L3 evict data volume [GBytes]  1.0E-09*PMC3*64.0 | ||||
| L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3-PMC4)*64.0/time | ||||
| L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3-PMC4)*64.0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| CPI = CPU_CYCLES/INST_RETIRED | ||||
| L3 load bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL-L2D_CACHE_ALLOCATE)*64.0/time | ||||
| L3 load data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL-L2D_CACHE_ALLOCATE)*64.0 | ||||
| L3 evict bandwidth [MBytes/s] = 1.0E-06*L2D_CACHE_WB*64.0/time | ||||
| L3 evict data volume [GBytes] = 1.0E-09*L2D_CACHE_WB*64.0 | ||||
| L3 bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL+L2D_CACHE_WB-L2D_CACHE_ALLOCATE)*64.0/time | ||||
| L3 data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL+L2D_CACHE_WB-L2D_CACHE_ALLOCATE)*64.0 | ||||
| - | ||||
| Profiling group to measure L2 <-> L3 cache bandwidth. The bandwidth is computed by the | ||||
| number of cache lines loaded from the L3 to the L2 data cache and the writebacks from | ||||
| the L2 data cache to the L3 cache. The group also outputs total data volume transferred between | ||||
| L3 and L2. For streaming-stores, the cache lines are allocated in L2, consequently there | ||||
| is no traffic between L3 and L2 in this case. But the L2D_CACHE_REFILL event counts these | ||||
| allocated cache lines, that's why the value of L2D_CACHE_REFILL is reduced | ||||
| by L2D_CACHE_ALLOCATE. | ||||
							
								
								
									
										32
									
								
								collectors/likwid/groups/arm8_tx2/MEM.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										32
									
								
								collectors/likwid/groups/arm8_tx2/MEM.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,32 @@ | ||||
| SHORT Main memory bandwidth in MBytes/s | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| MBOX0C0  MEMORY_READS | ||||
| MBOX0C1  MEMORY_WRITES | ||||
| MBOX1C0  MEMORY_READS | ||||
| MBOX1C1  MEMORY_WRITES | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Clock [MHz] 1.E-06*PMC1/time | ||||
| CPI  PMC1/PMC0 | ||||
| Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0)*64.0/time | ||||
| Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0)*64.0 | ||||
| Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1)*64.0/time | ||||
| Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1)*64.0 | ||||
| Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64.0/time | ||||
| Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64.0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MEMORY_READS))*64.0/runtime | ||||
| Memory read data volume [GBytes] = 1.0E-09*(SUM(MEMORY_READS))*64.0 | ||||
| Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MEMORY_WRITES))*64.0/runtime | ||||
| Memory write data volume [GBytes] = 1.0E-09*(SUM(MEMORY_WRITES))*64.0 | ||||
| Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MEMORY_READS)+SUM(MEMORY_WRITES))*64.0/runtime | ||||
| Memory data volume [GBytes] = 1.0E-09*(SUM(MEMORY_READS)+SUM(MEMORY_WRITES))*64.0 | ||||
| - | ||||
| Profiling group to measure memory bandwidth. It uses the performance monitoring | ||||
| hardware of the memory controllers. | ||||
							
								
								
									
										44
									
								
								collectors/likwid/groups/arm8_tx2/SPEC.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										44
									
								
								collectors/likwid/groups/arm8_tx2/SPEC.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,44 @@ | ||||
| SHORT Information about speculative execution | ||||
|  | ||||
| EVENTSET | ||||
| PMC0 INST_SPEC | ||||
| PMC1 LD_SPEC | ||||
| PMC2 ST_SPEC | ||||
| PMC3 DP_SPEC | ||||
| PMC4 VFP_SPEC | ||||
| PMC5 ASE_SPEC | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Operations spec. executed PMC0 | ||||
| Load ops spec. executed PMC1 | ||||
| Store ops spec. executed PMC2 | ||||
| Integer data ops spec. executed PMC3 | ||||
| Scalar FP ops spec. executed PMC4 | ||||
| Vector FP ops spec. executed PMC5 | ||||
| Other ops spec. executed (PMC0-PMC1-PMC2-PMC3-PMC4-PMC5) | ||||
| Load ops spec. ratio PMC1/PMC0 | ||||
| Store ops spec. ratio PMC2/PMC0 | ||||
| Integer data ops spec. ratio PMC3/PMC0 | ||||
| Scalar FP ops spec. ratio PMC4/PMC0 | ||||
| Vector FP ops spec. ratio PMC5/PMC0 | ||||
| Other ops spec. ratio (PMC0-PMC1-PMC2-PMC3-PMC4-PMC5)/PMC0 | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Load ops spec. ratio = LD_SPEC / INST_SPEC | ||||
| Store ops spec. ratio = ST_SPEC / INST_SPEC | ||||
| Integer data ops spec. ratio = DP_SPEC / INST_SPEC | ||||
| Scalar FP ops spec. ratio = VFP_SPEC / INST_SPEC | ||||
| Vector FP ops spec. ratio = ASE_SPEC / INST_SPEC | ||||
| Other ops spec. ratio = (INST_SPEC-LD_SPEC-ST_SPEC-DP_SPEC-VFP_SPEC-ASE_SPEC) / INST_SPEC | ||||
| - | ||||
| This group gives information about the speculative execution of micro-ops. | ||||
| It is currently unclear why Other ops spec. executed and ratio is negative | ||||
| in some cases. Although the documentation contains an OP_RETIRED, there is no | ||||
| equivalent OP_SPEC which could be a better reference in this group instead of | ||||
| INST_SPEC. | ||||
							
								
								
									
										27
									
								
								collectors/likwid/groups/arm8_tx2/TLB_DATA.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										27
									
								
								collectors/likwid/groups/arm8_tx2/TLB_DATA.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,27 @@ | ||||
| SHORT  L1 data TLB miss rate/ratio | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  L1D_TLB_REFILL_RD | ||||
| PMC3  L1D_TLB_REFILL_WR | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Clock [MHz] 1.E-06*PMC1/time | ||||
| CPI  PMC1/PMC0 | ||||
| L1 DTLB load misses     PMC2 | ||||
| L1 DTLB load miss rate  PMC2/PMC0 | ||||
| L1 DTLB store misses     PMC3 | ||||
| L1 DTLB store miss rate  PMC3/PMC0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| L1 DTLB load misses = L1D_TLB_REFILL_RD | ||||
| L1 DTLB load miss rate = L1D_TLB_REFILL_RD / INST_RETIRED | ||||
| L1 DTLB store misses = L1D_TLB_REFILL_WR | ||||
| L1 DTLB store miss rate = L1D_TLB_REFILL_WR / INST_RETIRED | ||||
| - | ||||
| The DTLB load and store miss rates give a measure of how often a TLB miss occurred | ||||
| per instruction. | ||||
|  | ||||
							
								
								
									
										23
									
								
								collectors/likwid/groups/arm8_tx2/TLB_INSTR.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								collectors/likwid/groups/arm8_tx2/TLB_INSTR.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,23 @@ | ||||
| SHORT  L1 Instruction TLB miss rate/ratio | ||||
|  | ||||
| EVENTSET | ||||
| PMC0  INST_RETIRED | ||||
| PMC1  CPU_CYCLES | ||||
| PMC2  L1I_TLB_REFILL | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Clock [MHz] 1.E-06*PMC1/time | ||||
| CPI  PMC1/PMC0 | ||||
| L1 ITLB misses     PMC2 | ||||
| L1 ITLB miss rate  PMC2/PMC0 | ||||
|  | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| L1 ITLB misses = L1I_TLB_REFILL | ||||
| L1 ITLB miss rate = L1I_TLB_REFILL / INST_RETIRED | ||||
| - | ||||
| The ITLB miss rate gives a measure of how often a TLB miss occurred | ||||
| per instruction. | ||||
|  | ||||
							
								
								
									
										29
									
								
								collectors/likwid/groups/atom/BRANCH.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								collectors/likwid/groups/atom/BRANCH.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,29 @@ | ||||
| SHORT Branch prediction miss rate/ratio | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| PMC0  BR_INST_RETIRED_ANY | ||||
| PMC1  BR_INST_RETIRED_MISPRED | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Branch rate   PMC0/FIXC0 | ||||
| Branch misprediction rate  PMC1/FIXC0 | ||||
| Branch misprediction ratio  PMC1/PMC0 | ||||
| Instructions per branch  FIXC0/PMC0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Branch rate = BR_INST_RETIRED_ANY/INSTR_RETIRED_ANY | ||||
| Branch misprediction rate = BR_INST_RETIRED_MISPRED/INSTR_RETIRED_ANY | ||||
| Branch misprediction ratio = BR_INST_RETIRED_MISPRED/BR_INST_RETIRED_ANY | ||||
| Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ANY | ||||
| - | ||||
| The rates state how often on average a branch or a mispredicted branch occurred | ||||
| per instruction retired in total. The branch misprediction ratio states directly | ||||
| what fraction of all branch instructions were mispredicted. | ||||
| Instructions per branch is 1/branch rate. | ||||
|  | ||||
							
								
								
									
										20
									
								
								collectors/likwid/groups/atom/DATA.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										20
									
								
								collectors/likwid/groups/atom/DATA.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,20 @@ | ||||
| SHORT Load to store ratio | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| PMC0  L1D_CACHE_LD | ||||
| PMC1  L1D_CACHE_ST | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Load to store ratio PMC0/PMC1 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Load to store ratio = L1D_CACHE_LD/L1D_CACHE_ST | ||||
| - | ||||
| This is a simple metric to determine your load to store ratio. | ||||
|  | ||||
							
								
								
									
										25
									
								
								collectors/likwid/groups/atom/FLOPS_DP.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								collectors/likwid/groups/atom/FLOPS_DP.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,25 @@ | ||||
| SHORT Double Precision MFLOP/s | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| PMC0  SIMD_COMP_INST_RETIRED_PACKED_DOUBLE | ||||
| PMC1  SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| DP [MFLOP/s]    1.0E-06*(PMC0*2.0+PMC1)/time | ||||
| Packed [MUOPS/s]   1.0E-06*PMC0/time | ||||
| Scalar [MUOPS/s] 1.0E-06*PMC1/time | ||||
|  | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| DP [MFLOP/s] = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_DOUBLE*2.0+SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE)/runtime | ||||
| Packed [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_PACKED_DOUBLE/runtime | ||||
| Scalar [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE/runtime | ||||
| -- | ||||
| Double Precision [MFLOP/s] Double Precision MFLOP/s | ||||
|  | ||||
							
								
								
									
										24
									
								
								collectors/likwid/groups/atom/FLOPS_SP.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								collectors/likwid/groups/atom/FLOPS_SP.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,24 @@ | ||||
| SHORT Single Precision MFLOP/s | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| PMC0  SIMD_COMP_INST_RETIRED_PACKED_SINGLE | ||||
| PMC1  SIMD_COMP_INST_RETIRED_SCALAR_SINGLE | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| SP [MFLOP/s] (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time | ||||
| Packed [MUOPS/s]   1.0E-06*(PMC0)/time | ||||
| Scalar [MUOPS/s] 1.0E-06*PMC1/time | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| SP [MFLOP/s] = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_SINGLE*4.0+SIMD_COMP_INST_RETIRED_SCALAR_SINGLE)/runtime | ||||
| Packed [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_PACKED_SINGLE/runtime | ||||
| Scalar [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_SCALAR_SINGLE/runtime | ||||
| -- | ||||
| Single Precision [MFLOP/s] Single Precision MFLOP/s | ||||
|  | ||||
							
								
								
									
										19
									
								
								collectors/likwid/groups/atom/FLOPS_X87.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								collectors/likwid/groups/atom/FLOPS_X87.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,19 @@ | ||||
| SHORT X87 MFLOP/s | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| PMC0  X87_COMP_OPS_EXE_ANY_AR | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| X87 [MFLOP/s]  1.0E-06*PMC0/time | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| X87 [MFLOP/s] = 1.0E-06*X87_COMP_OPS_EXE_ANY_AR/runtime | ||||
| -- | ||||
| The MFLOP/s made with X87 instructions | ||||
|  | ||||
							
								
								
									
										21
									
								
								collectors/likwid/groups/atom/MEM.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								collectors/likwid/groups/atom/MEM.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,21 @@ | ||||
| SHORT Main memory bandwidth in MBytes/s | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| PMC0  BUS_TRANS_MEM_THIS_CORE_THIS_A | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Memory bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time | ||||
| Memory data volume [GBytes] 1.0E-09*PMC0*64.0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Memory bandwidth [MBytes/s] = 1.0E-06*BUS_TRANS_MEM_THIS_CORE_THIS_A*64/time | ||||
| Memory data volume [GBytes] = 1.0E-09*BUS_TRANS_MEM_THIS_CORE_THIS_A*64.0 | ||||
| - | ||||
| Profiling group to measure memory bandwidth drawn by this core. | ||||
|  | ||||
							
								
								
									
										21
									
								
								collectors/likwid/groups/atom/TLB.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								collectors/likwid/groups/atom/TLB.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,21 @@ | ||||
| SHORT TLB miss rate | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| PMC0  DATA_TLB_MISSES_DTLB_MISS | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| DTLB misses       PMC0 | ||||
| DTLB miss rate    PMC0/FIXC0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| DTLB misses = DATA_TLB_MISSES_DTLB_MISS | ||||
| DTLB miss rate = DATA_TLB_MISSES_DTLB_MISS/INSTR_RETIRED_ANY | ||||
| -- | ||||
| The DTLB miss rate gives a measure of how often a TLB miss occurred per instruction. | ||||
|  | ||||
							
								
								
									
										31
									
								
								collectors/likwid/groups/broadwell/BRANCH.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										31
									
								
								collectors/likwid/groups/broadwell/BRANCH.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,31 @@ | ||||
| SHORT Branch prediction miss rate/ratio | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  BR_INST_RETIRED_ALL_BRANCHES | ||||
| PMC1  BR_MISP_RETIRED_ALL_BRANCHES | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Branch rate   PMC0/FIXC0 | ||||
| Branch misprediction rate  PMC1/FIXC0 | ||||
| Branch misprediction ratio  PMC1/PMC0 | ||||
| Instructions per branch  FIXC0/PMC0 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY | ||||
| Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY | ||||
| Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES | ||||
| Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES | ||||
| - | ||||
| The rates state how often on average a branch or a mispredicted branch occurred | ||||
| per instruction retired in total. The branch misprediction ratio sets directly | ||||
| into relation what fraction of all branch instructions were mispredicted. | ||||
| Instructions per branch is 1/branch rate. | ||||
|  | ||||
							
								
								
									
										26
									
								
								collectors/likwid/groups/broadwell/CLOCK.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								collectors/likwid/groups/broadwell/CLOCK.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,26 @@ | ||||
| SHORT Power and Energy consumption | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PWR0  PWR_PKG_ENERGY | ||||
| UBOXFIX UNCORE_CLOCK | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| Uncore Clock [MHz] 1.E-06*UBOXFIX/time | ||||
| CPI  FIXC1/FIXC0 | ||||
| Energy [J]  PWR0 | ||||
| Power [W] PWR0/time | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Power [W] = PWR_PKG_ENERGY / time | ||||
| Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time | ||||
| - | ||||
| Broadwell implements the new RAPL interface. This interface enables monitoring | ||||
| of the consumed energy at the package (socket) level. | ||||
|  | ||||
							
								
								
									
										38
									
								
								collectors/likwid/groups/broadwell/CYCLE_ACTIVITY.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										38
									
								
								collectors/likwid/groups/broadwell/CYCLE_ACTIVITY.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,38 @@ | ||||
| SHORT Cycle Activities | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING | ||||
| PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING | ||||
| PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING | ||||
| PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Cycles without execution [%] (PMC3/FIXC1)*100 | ||||
| Cycles without execution due to L1D [%] (PMC2/FIXC1)*100 | ||||
| Cycles without execution due to L2 [%] (PMC0/FIXC1)*100 | ||||
| Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100 | ||||
| Cycles without execution due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100 | ||||
| Cycles without execution due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100 | ||||
| Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100 | ||||
| -- | ||||
| This performance group measures the cycles while waiting for data from the cache | ||||
| and memory hierarchy. | ||||
| CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts the number of cycles in which nothing is | ||||
| executed on any execution port. | ||||
| CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is | ||||
| outstanding. | ||||
| CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is | ||||
| outstanding. | ||||
| CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while memory subsystem has an | ||||
| outstanding load. | ||||
							
								
								
									
										45
									
								
								collectors/likwid/groups/broadwell/CYCLE_STALLS.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										45
									
								
								collectors/likwid/groups/broadwell/CYCLE_STALLS.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,45 @@ | ||||
| SHORT Cycle Activities (Stalls) | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING | ||||
| PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING | ||||
| PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING | ||||
| PMC3 CYCLE_ACTIVITY_STALLS_TOTAL | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Total execution stalls PMC3 | ||||
| Stalls caused by L1D misses [%] (PMC2/PMC3)*100 | ||||
| Stalls caused by L2 misses [%] (PMC0/PMC3)*100 | ||||
| Stalls caused by memory loads [%] (PMC1/PMC3)*100 | ||||
| Execution stall rate [%] (PMC3/FIXC1)*100 | ||||
| Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100 | ||||
| Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100 | ||||
| Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL | ||||
| Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 | ||||
| Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 | ||||
| Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 | ||||
| Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100 | ||||
| Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100 | ||||
| Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100 | ||||
| Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100 | ||||
| -- | ||||
| This performance group measures the stalls caused by data traffic in the cache | ||||
| hierarchy. | ||||
| CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls. | ||||
| CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand | ||||
| load is outstanding. | ||||
| CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand | ||||
| load is outstanding. | ||||
| CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while memory subsystem has | ||||
| an outstanding load. | ||||
							
								
								
									
										22
									
								
								collectors/likwid/groups/broadwell/DATA.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										22
									
								
								collectors/likwid/groups/broadwell/DATA.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,22 @@ | ||||
| SHORT Load to store ratio | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0  MEM_UOPS_RETIRED_LOADS_ALL | ||||
| PMC1  MEM_UOPS_RETIRED_STORES_ALL | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Load to store ratio PMC0/PMC1 | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Load to store ratio = MEM_UOPS_RETIRED_LOADS_ALL/MEM_UOPS_RETIRED_STORES_ALL | ||||
| - | ||||
| This is a metric to determine your load to store ratio. | ||||
|  | ||||
							
								
								
									
										24
									
								
								collectors/likwid/groups/broadwell/DIVIDE.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								collectors/likwid/groups/broadwell/DIVIDE.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,24 @@ | ||||
| SHORT Divide unit information | ||||
|  | ||||
| EVENTSET | ||||
| FIXC0 INSTR_RETIRED_ANY | ||||
| FIXC1 CPU_CLK_UNHALTED_CORE | ||||
| FIXC2 CPU_CLK_UNHALTED_REF | ||||
| PMC0:EDGEDETECT ARITH_FPU_DIV_ACTIVE | ||||
| PMC1 ARITH_FPU_DIV_ACTIVE | ||||
|  | ||||
|  | ||||
| METRICS | ||||
| Runtime (RDTSC) [s] time | ||||
| Runtime unhalted [s] FIXC1*inverseClock | ||||
| Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock | ||||
| CPI  FIXC1/FIXC0 | ||||
| Number of divide ops PMC0:EDGEDETECT | ||||
| Avg. divide unit usage duration PMC1/PMC0:EDGEDETECT | ||||
|  | ||||
| LONG | ||||
| Formulas: | ||||
| Number of divide ops = ARITH_FPU_DIV_ACTIVE:EDGEDETECT | ||||
| Avg. divide unit usage duration = ARITH_FPU_DIV_ACTIVE/ARITH_FPU_DIV_ACTIVE:EDGEDETECT | ||||
| - | ||||
| This performance group measures the average latency of divide operations | ||||
Some files were not shown because too many files have changed in this diff Show More
		Reference in New Issue
	
	Block a user