Add likwid collector

2026-02-05 02:41:44 +01:00 · 2021-03-25 14:47:10 +01:00
parent 4fddcb9741
commit a6ac0c5373
670 changed files with 24926 additions and 0 deletions
--- a/collectors/likwid/groups/power8/BRANCH.txt
+++ b/collectors/likwid/groups/power8/BRANCH.txt
@@ -0,0 +1,30 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+PMC0  PM_BR_PRED_BR_CMPL
+PMC1  PM_BR_PRED_CCACHE_CMPL
+PMC2  PM_BR_PRED_CR_CMPL
+PMC3  PM_BR_MPRED_CMPL
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+Branch rate   (PMC0+PMC1+PMC2)/PMC4
+Branch misprediction rate  PMC3/PMC4
+Branch misprediction ratio  PMC3/(PMC0+PMC1+PMC2)
+Instructions per branch  PMC4/(PMC0+PMC1+PMC2)
+
+LONG
+Formulas:
+Branch rate = (PM_BR_PRED_BR_CMPL+PM_BR_PRED_CCACHE_CMPL+PM_BR_PRED_CR_CMPL)/PM_RUN_INST_CMPL
+Branch misprediction rate = PM_BR_MPRED_CMPL/PM_RUN_INST_CMPL
+Branch misprediction ratio = PM_BR_MPRED_CMPL/(PM_BR_PRED_BR_CMPL+PM_BR_PRED_CCACHE_CMPL+PM_BR_PRED_CR_CMPL)
+Instructions per branch = PM_RUN_INST_CMPL/(PM_BR_PRED_BR_CMPL+PM_BR_PRED_CCACHE_CMPL+PM_BR_PRED_CR_CMPL)
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The Branch misprediction ratio sets directly
+into relation what ratio of all branch instructions were mispredicted.
+Instructions per branch is 1/Branch rate.
+
--- a/collectors/likwid/groups/power8/CPISTACK1.txt
+++ b/collectors/likwid/groups/power8/CPISTACK1.txt
@@ -0,0 +1,35 @@
+SHORT First level of IBM CPI stack 
+
+EVENTSET
+PMC0  PM_CMPLU_STALL_THRD 
+PMC1  PM_GCT_EMPTY_CYC
+PMC3  PM_CMPLU_STALL
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+CPI  PMC5/PMC4
+Stall cycles PMC3
+Stall cycle ratio PMC3/PMC5
+Thread blocked cycles PMC0
+Thread blocked cycle ratio PMC0/PMC5
+GCT empty cycles PMC1
+GCT empty cycle ratio PMC1/PMC5
+
+
+
+
+LONG
+Formulas:
+Stall cycles = PM_CMPLU_STALL
+Stall cycle ratio = PM_CMPLU_STALL/PM_RUN_CYC
+Thread blocked cycles = PM_CMPLU_STALL_THRD
+Thread blocked cycle ratio = PM_CMPLU_STALL_THRD/PM_RUN_CYC
+GCT empty cycles = PM_GCT_EMPTY_CYC
+GCT empty cycle ratio = PM_GCT_EMPTY_CYC/PM_RUN_CYC
+--
+First level of IBM CPI stack. IBM names Stalled Cycles, Waiting to Complete,
+Thread Blocked, Completion Table Empty, Other and Completion Cycles. For some
+there are no clearly identifiable events, so this group concentrates on
+Stalled Cycles (PM_CMPLU_STALL), Thread Blocked (PM_CMPLU_STALL_THRD),
+Completion Table Empty (PM_GCT_EMPTY_CYC) and Other (PM_CMPLU_STALL_OTHER_CMPL).
--- a/collectors/likwid/groups/power8/DATA.txt
+++ b/collectors/likwid/groups/power8/DATA.txt
@@ -0,0 +1,23 @@
+SHORT Load to store ratio
+
+EVENTSET
+PMC0  PM_LD_CMPL
+PMC1  PM_ST_CMPL
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+Load to store ratio PMC0/PMC1
+Load ratio PMC0/PMC4
+Store ratio PMC1/PMC4
+
+LONG
+Formulas:
+Load to store ratio = PM_LD_CMPL/PM_ST_CMPL
+Load ratio = PM_LD_CMPL/PM_RUN_INST_CMPL
+Store ratio = PM_ST_CMPL/PM_RUN_INST_CMPL
+-
+This is a metric to determine your load to store ratio.
+
--- a/collectors/likwid/groups/power8/FLOPS_1_2.txt
+++ b/collectors/likwid/groups/power8/FLOPS_1_2.txt
@@ -0,0 +1,24 @@
+SHORT Group 121 as used in IBM Parallel Environment Developer Edition
+
+EVENTSET
+PMC0  PM_VSU0_1FLOP
+PMC1  PM_VSU1_1FLOP
+PMC2  PM_VSU0_2FLOP
+PMC3  PM_VSU1_2FLOP
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+CPI PMC5/PMC4
+One FLOP ops PMC0+PMC1
+Two FLOPs ops PMC2+PMC3
+[MFLOP/s]  1E-6*(PMC0+PMC1+((PMC2+PMC3)*2))/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+One FLOP ops = PM_VSU0_1FLOP+PM_VSU1_1FLOP
+Two FLOPs ops = PM_VSU0_2FLOP+PM_VSU1_2FLOP
+[MFLOP/s] = 1E-6*(PM_VSU0_1FLOP+PM_VSU1_1FLOP+((PM_VSU0_2FLOP+PM_VSU1_2FLOP)*2))/time
+--
+Group 121 from web page http://www.ibm.com/support/knowledgecenter/en/SSFK5S_2.2.0/com.ibm.cluster.pedev.v2r2.pedev100.doc/bl7ug_power8metrics.htm
--- a/collectors/likwid/groups/power8/FLOPS_4_8.txt
+++ b/collectors/likwid/groups/power8/FLOPS_4_8.txt
@@ -0,0 +1,24 @@
+SHORT Group 122 as used in IBM Parallel Environment Developer Edition
+
+EVENTSET
+PMC0  PM_VSU0_4FLOP
+PMC1  PM_VSU1_4FLOP
+PMC2  PM_VSU0_8FLOP
+PMC3  PM_VSU1_8FLOP
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+CPI PMC5/PMC4
+Four FLOPs ops PMC0+PMC1
+Eight FLOPs ops PMC2+PMC3
+MFLOP/s 1E-6*(((PMC0+PMC1)*4.0)+((PMC2+PMC3)*8.0))/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+Four FLOPs ops = PM_VSU0_4FLOP+PM_VSU1_4FLOP
+Eight FLOPs ops = PM_VSU0_8FLOP+PM_VSU1_8FLOP
+MFLOP/s = 1E-6*(((PM_VSU0_4FLOP+PM_VSU1_4FLOP)*4.0)+((PM_VSU0_8FLOP+PM_VSU1_8FLOP)*8.0))/time
+--
+Group 122 from web page http://www.ibm.com/support/knowledgecenter/en/SSFK5S_2.2.0/com.ibm.cluster.pedev.v2r2.pedev100.doc/bl7ug_power8metrics.htm
--- a/collectors/likwid/groups/power8/FLOPS_DP.txt
+++ b/collectors/likwid/groups/power8/FLOPS_DP.txt
@@ -0,0 +1,27 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+PMC0  PM_VSU0_DP_2FLOP
+PMC1  PM_VSU0_DP_FMA
+PMC2  PM_VSU0_DP_FSQRT_FDIV
+PMC3  PM_VSU0_SCALAR_DP_ISSUED
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+DP [MFLOP/s]  1.0E-06*((PMC0*2.0)+PMC2+(PMC1*4.0))/time
+DP VSX [MFLOP/s]  1.0E-06*((PMC1*4.0)+(PMC0*2.0))/time
+Packed [MUOPS/s]   1.0E-06*(PMC1)/time
+Scalar [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+DP [MFLOP/s] = 1.0E-06*(PM_VSU0_DP_2FLOP*2+PM_VSU0_DP_FSQRT_FDIV+PM_VSU0_DP_FMA*4)/runtime
+DP VSX [MFLOP/s] = 1.0E-06*(PM_VSU0_DP_FMA*4+PM_VSU0_DP_2FLOP*2)/runtime
+Packed [MUOPS/s] = 1.0E-06*(PM_VSU0_DP_FMA)/runtime
+Scalar [MUOPS/s] = 1.0E-06*(PM_VSU0_DP_2FLOP+PM_VSU0_DP_FSQRT_FDIV)/runtime
+--
+
--- a/collectors/likwid/groups/power8/FLOPS_DP2.txt
+++ b/collectors/likwid/groups/power8/FLOPS_DP2.txt
@@ -0,0 +1,27 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+PMC0  PM_VSU1_DP_2FLOP
+PMC1  PM_VSU1_DP_FMA
+PMC2  PM_VSU1_DP_FSQRT_FDIV
+PMC3  PM_VSU1_SCALAR_DP_ISSUED
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+DP [MFLOP/s]  1.0E-06*((PMC0*2.0)+PMC2+(PMC1*4.0))/time
+DP VSX [MFLOP/s]  1.0E-06*((PMC1)*4.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC1)/time
+Scalar [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+DP [MFLOP/s] = 1.0E-06*(PM_VSU1_DP_2FLOP*2+PM_VSU1_DP_FSQRT_FDIV+PM_VSU1_DP_FMA*4)/runtime
+DP VSX [MFLOP/s] = 1.0E-06*(PM_VSU1_DP_FMA*4)/runtime
+Packed [MUOPS/s] = 1.0E-06*(PM_VSU1_DP_FMA)/runtime
+Scalar [MUOPS/s] = 1.0E-06*(PM_VSU1_DP_2FLOP+PM_VSU1_DP_FSQRT_FDIV)/runtime
+--
+
--- a/collectors/likwid/groups/power8/FLOPS_FMA.txt
+++ b/collectors/likwid/groups/power8/FLOPS_FMA.txt
@@ -0,0 +1,28 @@
+SHORT Group 124 as used in IBM Parallel Environment Developer Edition
+
+EVENTSET
+PMC0  PM_VSU0_DP_FMA
+PMC1  PM_VSU1_DP_FMA
+PMC2  PM_VSU0_FMA
+PMC3  PM_VSU1_FMA
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+CPI PMC5/PMC4
+DP FMAs PMC0+PMC1
+Scalar FMAs PMC2+PMC3
+DP FMA [MFLOP/s] 1E-6*(PMC0+PMC1)*4.0/time
+Scalar FMA [MFLOP/s] 1E-6*(PMC2+PMC3)*2.0/time
+[MFLOP/s] 1E-6*(((PMC0+PMC1)*4.0)+((PMC2+PMC3)*2.0))/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+DP FMAs = PM_VSU0_DP_FMA+PM_VSU1_DP_FMA
+Scalar FMAs = PM_VSU0_FMA+PM_VSU1_FMA
+DP FMA [MFLOP/s] = 1E-6*(PM_VSU0_DP_FMA+PM_VSU1_DP_FMA)*4.0/runtime
+Scalar FMA [MFLOP/s] = 1E-6*(PM_VSU0_FMA+PM_VSU1_FMA)*2.0/runtime
+[MFLOP/s] = 1E-6*(((PM_VSU0_DP_FMA+PM_VSU1_DP_FMA)*4.0)+((PM_VSU0_FMA+PM_VSU1_FMA)*2.0))/runtime
+--
+Group 124 from web page http://www.ibm.com/support/knowledgecenter/en/SSFK5S_2.2.0/com.ibm.cluster.pedev.v2r2.pedev100.doc/bl7ug_power8metrics.htm
--- a/collectors/likwid/groups/power8/FLOPS_SP.txt
+++ b/collectors/likwid/groups/power8/FLOPS_SP.txt
@@ -0,0 +1,27 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+PMC0  PM_VSU0_SINGLE
+PMC1  PM_VSU0_VECTOR_SP_ISSUED
+PMC2  PM_VSU1_SINGLE
+PMC3  PM_VSU1_VECTOR_SP_ISSUED
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+SP [MFLOP/s]  1.0E-06*(((PMC0-PMC1)+(PMC2-PMC3))*4.0+(PMC1+PMC3)*8.0)/time
+SP VSX [MFLOP/s]  1.0E-06*((PMC1+PMC3)*8.0)/time
+Packed [MUOPS/s]   1.0E-06*(PMC1+PMC3)/time
+Scalar [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+SP [MFLOP/s] = 1.0E-06*(((PM_VSU0_SINGLE-PM_VSU0_VECTOR_SP_ISSUED)+(PM_VSU1_SINGLE-PM_VSU1_VECTOR_SP_ISSUED))*4+(PM_VSU0_VECTOR_SP_ISSUED+PM_VSU1_VECTOR_SP_ISSUED)*8)/runtime
+SP VSX [MFLOP/s] = 1.0E-06*((PM_VSU0_VECTOR_SP_ISSUED+PM_VSU1_VECTOR_SP_ISSUED)*8)/runtime
+Packed [MUOPS/s] = 1.0E-06*(PM_VSU0_VECTOR_SP_ISSUED+PM_VSU1_VECTOR_SP_ISSUED)/runtime
+Scalar [MUOPS/s] = 1.0E-06*(PM_VSU0_SINGLE+PM_VSU1_SINGLE)/runtime
+--
+
--- a/collectors/likwid/groups/power8/FLOPS_VSU0.txt
+++ b/collectors/likwid/groups/power8/FLOPS_VSU0.txt
@@ -0,0 +1,23 @@
+SHORT Double Precision MFlops/s performed by VSU pipe 0
+
+EVENTSET
+PMC0  PM_VSU0_DP_2FLOP
+PMC1  PM_VSU0_DP_FMA
+PMC2  PM_VSU0_DP_FSQRT_FDIV
+PMC3  PM_VSU0_1FLOP
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+[MFLOP/s]  1.0E-06*((PMC0*2.0)+(PMC2*8.0)+(PMC1*4.0)+PMC3)/time
+VSX [MFLOP/s]  1.0E-06*(PMC1*4.0)/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+[MFLOP/s] = 1.0E-06*(PM_VSU0_DP_2FLOP*2+PM_VSU0_DP_FSQRT_FDIV*8+PM_VSU0_DP_FMA*4+PM_VSU0_1FLOP)/runtime
+VSX [MFLOP/s] = 1.0E-06*(PM_VSU0_DP_FMA*4)/runtime
+--
+
--- a/collectors/likwid/groups/power8/FLOPS_VSU1.txt
+++ b/collectors/likwid/groups/power8/FLOPS_VSU1.txt
@@ -0,0 +1,22 @@
+SHORT Double Precision MFlops/s performed by VSU pipe 1
+
+EVENTSET
+PMC0  PM_VSU1_DP_2FLOP
+PMC1  PM_VSU1_DP_FMA
+PMC2  PM_VSU1_DP_FSQRT_FDIV
+PMC3  PM_VSU1_1FLOP
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+[MFLOP/s]  1.0E-06*((PMC0*2.0)+(PMC2*8.0)+(PMC1*4.0)+PMC3)/time
+VSX [MFLOP/s]  1.0E-06*(PMC1*4.0)/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+[MFLOP/s] = 1.0E-06*(PM_VSU1_DP_2FLOP*2+PM_VSU1_DP_FSQRT_FDIV*8+PM_VSU1_DP_FMA*4+PM_VSU1_1FLOP)/runtime
+VSX [MFLOP/s] = 1.0E-06*(PM_VSU1_DP_FMA*4)/runtime
+--
--- a/collectors/likwid/groups/power8/FLOPS_VSX.txt
+++ b/collectors/likwid/groups/power8/FLOPS_VSX.txt
@@ -0,0 +1,29 @@
+SHORT Vectorized MFlops/s
+
+EVENTSET
+PMC0  PM_VSU0_VECTOR_DP_ISSUED
+PMC1  PM_VSU1_VECTOR_DP_ISSUED
+PMC2  PM_VSU0_VECTOR_SP_ISSUED
+PMC3  PM_VSU1_VECTOR_SP_ISSUED
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+[MFLOP/s]  1.0E-06*((PMC0+PMC1)*4.0+(PMC2+PMC3)*8.0)/time
+DP [MFLOP/s]  1.0E-06*((PMC0+PMC1)*4.0)/time
+SP [MFLOP/s]  1.0E-06*((PMC2+PMC3)*8.0)/time
+DP [MUOPS/s]   1.0E-06*(PMC0+PMC1)/time
+SP [MUOPS/s]   1.0E-06*(PMC2+PMC3)/time
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+[MFLOP/s] = 1.0E-06*((PM_VSU0_VECTOR_DP_ISSUED+PM_VSU1_VECTOR_DP_ISSUED)*4.0+(PM_VSU0_VECTOR_SP_ISSUED+PM_VSU1_VECTOR_SP_ISSUED)*8.0)/runtime
+DP [MFLOP/s] = 1.0E-06*((PM_VSU0_VECTOR_DP_ISSUED+PM_VSU1_VECTOR_DP_ISSUED)*4.0)/runtime
+SP [MFLOP/s] = 1.0E-06*((PM_VSU0_VECTOR_SP_ISSUED+PM_VSU1_VECTOR_SP_ISSUED)*8.0)/runtime
+DP [MUOPS/s] = 1.0E-06*(PM_VSU0_VECTOR_DP_ISSUED+PM_VSU1_VECTOR_DP_ISSUED)/runtime
+SP [MUOPS/s] = 1.0E-06*(PM_VSU0_VECTOR_SP_ISSUED+PM_VSU1_VECTOR_SP_ISSUED)/runtime
+--
+
--- a/collectors/likwid/groups/power8/ICACHE.txt
+++ b/collectors/likwid/groups/power8/ICACHE.txt
@@ -0,0 +1,22 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+PMC0  PM_INST_FROM_L1
+PMC1  PM_L1_ICACHE_MISS
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+L1I request rate PMC0/PMC4
+L1I miss rate PMC1/PMC4
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = PM_INST_FROM_L1 / PM_RUN_INST_CMPL
+L1I miss rate = PM_L1_ICACHE_MISS / PM_RUN_INST_CMPL
+L1I miss ratio = PM_L1_ICACHE_MISS / PM_INST_FROM_L1
+-
+This group measures some L1 instruction cache metrics.
--- a/collectors/likwid/groups/power8/L1.txt
+++ b/collectors/likwid/groups/power8/L1.txt
@@ -0,0 +1,33 @@
+SHORT  L1 cache bandwidth in MBytes/s
+
+EVENTSET
+PMC0  PM_LD_REF_L1 
+PMC1  PM_ST_CMPL
+PMC2  PM_LSU_L1_PREF
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+L2D load bandwidth [MBytes/s]  1.0E-06*((PMC0+PMC2)/2)*64.0/time
+L2D load data volume [GBytes]  1.0E-09*((PMC0+PMC2)/2)*64.0
+L2D store bandwidth [MBytes/s]  1.0E-06*((PMC1/2))*64.0/time
+L2D store data volume [GBytes]  1.0E-09*((PMC1/2))*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*((PMC1+PMC0+PMC2)/2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*((PMC1+PMC0+PMC2)/2)*64.0
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+L2D load bandwidth [MBytes/s] = 1.0E-06*((PM_LD_REF_L1+PM_LSU_L1_PREF)/2)*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*((PM_LD_REF_L1+PM_LSU_L1_PREF)/2)*64.0
+L2D store bandwidth [MBytes/s] = 1.0E-06*(PM_ST_CMPL/2)*64.0/time
+L2D store data volume [GBytes] = 1.0E-09*(PM_ST_CMPL/2)*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*((PM_ST_CMPL+PM_LD_REF_L1+PM_LSU_L1_PREF)/2)*64.0/time
+L2 data volume [GBytes] = 1.0E-09*((PM_ST_CMPL+PM_LD_REF_L1+PM_LSU_L1_PREF)/2)*64.0
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
+number of cacheline loaded from the L2 to the L1 data cache. There is currently no
+event to get the evicted data volume.
--- a/collectors/likwid/groups/power8/L2.txt
+++ b/collectors/likwid/groups/power8/L2.txt
@@ -0,0 +1,32 @@
+SHORT  L2 cache bandwidth in MBytes/s
+
+EVENTSET
+PMC0  PM_L2_ST 
+PMC2  PM_LD_MISS_L1
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+L2D load bandwidth [MBytes/s]  1.0E-06*(PMC2/2)*128.0/time
+L2D load data volume [GBytes]  1.0E-09*(PMC2/2)*128.0
+L2D store bandwidth [MBytes/s]  1.0E-06*(PMC0/2)*128.0/time
+L2D store data volume [GBytes]  1.0E-09*(PMC0/2)*128.0
+L2 bandwidth [MBytes/s] 1.0E-06*((PMC0+PMC2)/2)*128.0/time
+L2 data volume [GBytes] 1.0E-09*((PMC0+PMC2)/2)*128.0
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+L2D load bandwidth [MBytes/s] = 1.0E-06*(PM_LD_MISS_L1/2)*128.0/time
+L2D load data volume [GBytes] = 1.0E-09*(PM_LD_MISS_L1/2)*128.0
+L2D store bandwidth [MBytes/s] = 1.0E-06*(PM_L2_ST/2)*128.0/time
+L2D store data volume [GBytes] = 1.0E-09*(PM_L2_ST/2)*128.0
+L2 bandwidth [MBytes/s] = 1.0E-06*((PM_L2_ST+PM_LD_MISS_L1)/2)*128.0/time
+L2 data volume [GBytes] = 1.0E-09*((PM_L2_ST+PM_LD_MISS_L1)/2)*128.0
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
+number of cacheline loaded from the L2 to the L1 data cache. There is currently no
+event to get the evicted data volume.
--- a/collectors/likwid/groups/power8/L2CACHE.txt
+++ b/collectors/likwid/groups/power8/L2CACHE.txt
@@ -0,0 +1,40 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+PMC0  PM_L2_ST_MISS
+PMC1  PM_L2_LD_MISS
+PMC2  PM_L2_LD_DISP
+PMC3  PM_L2_ST_DISP
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+L2 request rate (PMC2+PMC3)/PMC4
+L2 miss rate (PMC0+PMC1)/PMC4
+L2 miss ratio (PMC0+PMC1)/(PMC2+PMC3)
+
+LONG
+Formulas:
+L2 request rate = (PM_L2_LD_DISP+PM_L2_ST_DISP)/PM_RUN_INST_CMPL
+L2 miss rate = (PM_L2_LD_MISS+PM_L2_ST_MISS)/PM_RUN_INST_CMPL
+L2 miss ratio = (PM_L2_LD_MISS+PM_L2_ST_MISS)/(PM_L2_LD_DISP+PM_L2_ST_DISP)
+L2 load request rate = PM_L2_LD_DISP/PM_RUN_INST_CMPL
+L2 store request rate = PM_L2_ST_DISP/PM_RUN_INST_CMPL
+L2 load miss rate = PM_L2_LD_MISS/PM_RUN_INST_CMPL
+L2 store miss rate = PM_L2_ST_MISS/PM_RUN_INST_CMPL
+L2 load miss ratio = PM_L2_LD_MISS/(PM_L2_LD_DISP+PM_L2_ST_DISP)
+L2 store miss ratio = PM_L2_ST_MISS/(PM_L2_LD_DISP+PM_L2_ST_DISP)
+-
+This group measures the locality of your data accesses with regard to the
+L2 Cache. L2 request rate tells you how data intensive your code is
+or how many Data accesses you have in average per instruction.
+The L2 miss rate gives a measure how often it was necessary to get
+cachelines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cacheline to be loaded from a higher level.
+While the Data cache miss rate might be given by your algorithm you should
+try to get Data cache miss ratio as low as possible by increasing your cache reuse.
+
+
--- a/collectors/likwid/groups/power8/L3.txt
+++ b/collectors/likwid/groups/power8/L3.txt
@@ -0,0 +1,31 @@
+SHORT  L3 cache bandwidth in MBytes/s
+
+EVENTSET
+PMC0  PM_L3_LD_PREF
+PMC3  PM_DATA_FROM_L3
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+L3D load bandwidth [MBytes/s]  1.0E-06*(PMC3+(PMC0-PMC3))*128.0/time
+L3D load data volume [GBytes]  1.0E-09*(PMC3+(PMC0-PMC3))*128.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC3+(PMC0-PMC3))*128.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC3+(PMC0-PMC3))*128.0
+Loads from local L3 per cycle 100.0*(PMC3+(PMC0-PMC3))/PMC5
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+L3D load bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_FROM_L3)*128.0/time
+L3D load data volume [GBytes] = 1.0E-09*(PM_DATA_FROM_L3)*128.0
+L3D evict bandwidth [MBytes/s] = 1.0E-06*(PM_L2_CASTOUT_MOD)*128.0/time
+L3D evict data volume [GBytes] = 1.0E-09*(PM_L2_CASTOUT_MOD)*128.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_FROM_L3+PM_L2_CASTOUT_MOD)*128.0/time
+L3 data volume [GBytes] = 1.0E-09*(PM_DATA_FROM_L3+PM_L2_CASTOUT_MOD)*128.0
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
+number of cacheline loaded from the L3 to the L2 data cache. There is currently no
+event to get the evicted data volume.
--- a/collectors/likwid/groups/power8/MEM.txt
+++ b/collectors/likwid/groups/power8/MEM.txt
@@ -0,0 +1,30 @@
+SHORT Main memory bandwidth in MBytes/s
+
+EVENTSET
+PMC0  PM_L3_CO_MEPF
+PMC1  PM_DATA_ALL_FROM_MEMORY
+PMC3  PM_L3_PF_ON_CHIP_MEM
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+Memory load bandwidth [MBytes/s] 1.0E-06*(PMC1+PMC3)*128.0/time
+Memory load data volume [GBytes] 1.0E-09*(PMC1+PMC3)*128.0
+Memory evict bandwidth [MBytes/s] 1.0E-06*(PMC0)*128.0/time
+Memory evict data volume [GBytes] 1.0E-09*(PMC0)*128.0
+Memory bandwidth [MBytes/s] 1.0E-06*(PMC1+PMC3+PMC0)*128.0/time
+Memory data volume [GBytes] 1.0E-09*(PMC1+PMC3+PMC0)*128.0
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC / PM_RUN_INST_CMPL
+Memory load bandwidth [MBytes/s] = 1.0E-06* (PM_DATA_ALL_FROM_MEMORY+PM_L3_PF_ON_CHIP_MEM)*128/time
+Memory load data volume [GBytes] = 1.0E-09* (PM_DATA_ALL_FROM_MEMORY+PM_L3_PF_ON_CHIP_MEM)*128
+Memory evict bandwidth [MBytes/s] = 1.0E-06* (PM_L3_CO_MEPF)*128/time
+Memory evict data volume [GBytes] = 1.0E-09* (PM_L3_CO_MEPF)*128
+Memory bandwidth [MBytes/s] = 1.0E-06* (PM_DATA_ALL_FROM_MEMORY+PM_L3_PF_ON_CHIP_MEM+PM_L3_CO_MEPF)*128/time
+Memory data volume [GBytes] = 1.0E-09* (PM_DATA_ALL_FROM_MEMORY+PM_L3_PF_ON_CHIP_MEM+PM_L3_CO_MEPF)*128
+--
+This group uses the core-local events to measure data traffic from memory.
--- a/collectors/likwid/groups/power8/NUMA.txt
+++ b/collectors/likwid/groups/power8/NUMA.txt
@@ -0,0 +1,29 @@
+SHORT Memory bandwidth in MBytes/s for local and remote memory
+
+EVENTSET
+PMC1  PM_DATA_ALL_FROM_LMEM
+PMC3  PM_DATA_ALL_FROM_DMEM
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+Local bandwidth [MBytes/s] 1.0E-06*(PMC1)*128.0/time
+Local data volume [GBytes] 1.0E-09*(PMC1)*128.0
+Remote bandwidth [MBytes/s] 1.0E-06*(PMC3)*128.0/time
+Remote data volume [GBytes] 1.0E-09*(PMC3)*128.0
+Memory load bandwidth [MBytes/s] 1.0E-06*(PMC1+PMC3)*128.0/time
+Memory load data volume [GBytes] 1.0E-09*(PMC1+PMC3)*128.0
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC / PM_RUN_INST_CMPL
+Local bandwidth [MBytes/s] = 1.0E-06* (PM_DATA_ALL_FROM_LMEM)*128/time
+Local data volume [GBytes] = 1.0E-09* (PM_DATA_ALL_FROM_LMEM)*128
+Remote bandwidth [MBytes/s] = 1.0E-06* (PM_DATA_ALL_FROM_DMEM)*128/time
+Remote data volume [GBytes] = 1.0E-09* (PM_DATA_ALL_FROM_DMEM)*128
+Memory load bandwidth [MBytes/s] = 1.0E-06* (PM_DATA_ALL_FROM_LMEM+PM_DATA_ALL_FROM_DMEM)*128/time
+Memory load data volume [GBytes] = 1.0E-09* (PM_DATA_ALL_FROM_LMEM+PM_DATA_ALL_FROM_DMEM)*128
+--
+This group measures the NUMA traffic by separating local from remote memory data transfers.
--- a/collectors/likwid/groups/power8/STALLS1.txt
+++ b/collectors/likwid/groups/power8/STALLS1.txt
@@ -0,0 +1,33 @@
+SHORT Completion stalls (group 1)
+
+EVENTSET
+PMC0 PM_CMPLU_STALL_THRD
+PMC1 PM_CMPLU_STALL_DCACHE_MISS
+PMC2 PM_CMPLU_STALL_COQ_FULL
+PMC3 PM_CMPLU_STALL
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime time
+CPI  PMC5/PMC4
+Completion stall cycles PMC3
+Stall cycles by thread conflict PMC0
+Stall ratio by thread conflict [%] PMC0/PMC3*100.0
+Stall cycles by d-cache miss PMC1
+Stall ratio by d-cache miss [%] PMC1/PMC3*100.0
+Stall cycles by full castout queue PMC2
+Stall ratio by full castout queue [%] PMC2/PMC3*100.0
+
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC / PM_RUN_INST_CMPL
+Completion stall cycles = PM_CMPLU_STALL
+Stall cycles by thread conflict = PM_CMPLU_STALL_THRD
+Stall ratio by thread conflict [%] = PM_CMPLU_STALL_THRD/PM_CMPLU_STALL*100
+Stall cycles by d-cache miss = PM_CMPLU_STALL_DCACHE_MISS
+Stall ratio by d-cache miss [%] = PM_CMPLU_STALL_DCACHE_MISS/PM_CMPLU_STALL*100
+Stall cycles by full castout queue = PM_CMPLU_STALL_COQ_FULL
+Stall ratio by full castout queue [%] = PM_CMPLU_STALL_COQ_FULL/PM_CMPLU_STALL*100
+--
--- a/collectors/likwid/groups/power8/STALLS2.txt
+++ b/collectors/likwid/groups/power8/STALLS2.txt
@@ -0,0 +1,32 @@
+SHORT Completion stalls (group 2)
+
+EVENTSET
+PMC0 PM_CMPLU_STALL
+PMC1 PM_CMPLU_STALL_LSU
+PMC2 PM_CMPLU_STALL_FLUSH
+PMC3 PM_CMPLU_STALL_BRU
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+CPI  PMC5/PMC4
+Stall cycles PMC0
+Stall cycles by load/store unit PMC1
+Stall ratio by load/store unit [%] PMC1/PMC0*100.0
+Stall cycles by pipeline flush PMC2
+Stall ratio by pipeline flush [%] PMC2/PMC0*100.0
+Stall cycles by branch unit PMC3
+Stall ratio by branch unit [%] PMC3/PMC0*100.0
+
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC / PM_RUN_INST_CMPL
+Stall cycles = PM_CMPLU_STALL
+Stall cycles by load/store unit = PM_CMPLU_STALL_LSU
+Stall ratio by load/store unit [%] = PM_CMPLU_STALL_LSU/PM_CMPLU_STALL*100.0
+Stall cycles by pipeline flush = PM_CMPLU_STALL_FLUSH
+Stall ratio by pipeline flush [%] = PM_CMPLU_STALL_FLUSH/PM_CMPLU_STALL*100.0
+Stall cycles by branch unit = PM_CMPLU_STALL_BRU
+Stall ratio by branch unit [%] = PM_CMPLU_STALL_BRU/PM_CMPLU_STALL*100.0
+--
--- a/collectors/likwid/groups/power8/TLB_DATA.txt
+++ b/collectors/likwid/groups/power8/TLB_DATA.txt
@@ -0,0 +1,37 @@
+SHORT  L1 Data TLB miss rate/ratio
+
+EVENTSET
+PMC0  PM_DTLB_MISS_16G
+PMC1  PM_DTLB_MISS_4K
+PMC2  PM_DTLB_MISS_64K
+PMC3  PM_DTLB_MISS_16M
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+L1 DTLB 4K misses     PMC1
+L1 DTLB 4K miss rate  PMC1/PMC4
+L1 DTLB 64K misses     PMC2
+L1 DTLB 64K miss rate  PMC2/PMC4
+L1 DTLB 16M misses     PMC3
+L1 DTLB 16M miss rate  PMC3/PMC4
+L1 DTLB 16G misses     PMC0
+L1 DTLB 16G miss rate  PMC0/PMC4
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC / PM_RUN_INST_CMPL
+L1 DTLB 4K misses = PM_DTLB_MISS_4K
+L1 DTLB 4K miss rate = PM_DTLB_MISS_4K/PM_RUN_INST_CMPL
+L1 DTLB 64K misses = PM_DTLB_MISS_64K
+L1 DTLB 64K miss rate = PM_DTLB_MISS_64K/PM_RUN_INST_CMPL
+L1 DTLB 16M misses = PM_DTLB_MISS_16M
+L1 DTLB 16M miss rate = PM_DTLB_MISS_16M/PM_RUN_INST_CMPL
+L1 DTLB 16G misses = PM_DTLB_MISS_16G
+L1 DTLB 16G miss rate = PM_DTLB_MISS_16G/PM_RUN_INST_CMPL
+--
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction.
+
--- a/collectors/likwid/groups/power8/TLB_INSTR.txt
+++ b/collectors/likwid/groups/power8/TLB_INSTR.txt
@@ -0,0 +1,21 @@
+SHORT  L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+PMC2  PM_ITLB_MISS
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC5/PMC4
+L1 ITLB misses     PMC2
+L1 ITLB miss rate  PMC2/PMC4
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC / PM_RUN_INST_CMPL
+L1 ITLB misses = PM_ITLB_MISS
+L1 ITLB miss rate = PM_ITLB_MISS/PM_RUN_INST_CMPL
+--
+The ITLB miss rate gives a measure of how often a TLB miss occurred per instruction.
+
--- a/collectors/likwid/groups/power8/USEFUL.txt
+++ b/collectors/likwid/groups/power8/USEFUL.txt
@@ -0,0 +1,24 @@
+SHORT Rate of useful instructions
+
+EVENTSET
+PMC0  PM_IOPS_CMPL
+PMC1  PM_INST_DISP
+PMC2  PM_IOPS_DISP
+PMC4  PM_RUN_INST_CMPL
+PMC5  PM_RUN_CYC
+
+METRICS
+CPI  PMC5/PMC4
+Useful instr. rate PMC4/PMC1*100.0
+Useful uops rate PMC0/PMC2*100.0
+
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC / PM_RUN_INST_CMPL
+Useful instr. rate = PM_RUN_INST_CMPL/PM_INST_DISP*100.0
+Useful uops rate = PM_IOPS_CMPL/PM_IOPS_DISP*100.0
+--
+This group measures how many of the dispatched instructions and internal operations (uops) are
+actually completed. These metrics compare the speculatively dispatched instructions with the
+completed instructions.