diff --git a/clusterdaemon.go b/clusterdaemon.go deleted file mode 100644 index afe82b8..0000000 --- a/clusterdaemon.go +++ /dev/null @@ -1,320 +0,0 @@ -package main - -import ( - "fmt" - "os" - "os/exec" - - //"bytes" - // "context" - "encoding/json" - "path/filepath" - - //"sort" - "errors" - "strings" - "time" - - protocol "github.com/influxdata/line-protocol" -) - -type GlobalConfig struct { - Sink struct { - User string `json:"user"` - Password string `json:"password"` - } `json:"sink"` - Host string `json:"host"` - Port string `json:"port"` - Report struct { - Levels string `json:"levels"` - Interval int `json:"interval"` - } `json:"report"` - Schedule struct { - Core struct { - Frequency int `json:"frequency"` - Duration int `json:"duration"` - } `json:"core"` - Node struct { - Frequency int `json:"frequency"` - Duration int `json:"duration"` - } `json:"node"` - } `json:"schedule"` - Metrics []string `json:"metrics"` - CollectorPath string `json:"collector_path"` -} - -type CollectorConfig struct { - Command string `json:"command"` - Args string `json:"arguments"` - Provides []string `json:"provides"` -} - -type InternalCollectorConfig struct { - Config CollectorConfig - Location string - LastRun time.Time - encoder *protocol.Encoder -} - -////////////////////////////////////////////////////////////////////////////// -// Load global configuration from JSON file -////////////////////////////////////////////////////////////////////////////// -func LoadGlobalConfiguration(file string, config *GlobalConfig) error { - configFile, err := os.Open(file) - defer configFile.Close() - if err != nil { - return err - } - jsonParser := json.NewDecoder(configFile) - jsonParser.Decode(config) - return err -} - -////////////////////////////////////////////////////////////////////////////// -// Load collector configuration from JSON file -////////////////////////////////////////////////////////////////////////////// -func LoadCollectorConfiguration(file string, config *CollectorConfig) error { - configFile, err := os.Open(file) - defer configFile.Close() - if err != nil { - return err - } - jsonParser := json.NewDecoder(configFile) - jsonParser.Decode(config) - return err -} - -////////////////////////////////////////////////////////////////////////////// -// Load collector configurations -////////////////////////////////////////////////////////////////////////////// -func GetSingleCollector(folders *[]string) filepath.WalkFunc { - return func(path string, info os.FileInfo, err error) error { - if info.IsDir() { - configfile := filepath.Join(path, "config.json") - if _, err := os.Stat(configfile); err == nil { - // TODO: Validate config? 
- p, err := filepath.Abs(path) - if err == nil { - *folders = append(*folders, p) - } - } - } - return nil - } -} - -func GetCollectorFolders(root string, folders *[]string) error { - err := filepath.Walk(root, GetSingleCollector(folders)) - if err != nil { - err = errors.New("Cannot get collectors") - } - return err -} - -////////////////////////////////////////////////////////////////////////////// -// Setup all collectors -////////////////////////////////////////////////////////////////////////////// -func SetupCollectors(config GlobalConfig) ([]InternalCollectorConfig, error) { - var folders []string - var outconfig []InternalCollectorConfig - //encoder := protocol.NewEncoder(buf) - //encoder.SetMaxLineBytes(1024) - GetCollectorFolders(config.CollectorPath, &folders) - for _, path := range folders { - var col_config InternalCollectorConfig - LoadCollectorConfiguration(filepath.Join(path, "config.json"), &col_config.Config) - col_config.LastRun = time.Now() - col_config.Location = path - //buf := &bytes.Buffer{} - //col_config.Encoder := protocol.NewEncoder(buf) - //col_config.Encoder.SetMaxLineBytes(1024) - outconfig = append(outconfig, col_config) - } - return outconfig, nil -} - -////////////////////////////////////////////////////////////////////////////// -// Run collector -////////////////////////////////////////////////////////////////////////////// -func RunCollector(config InternalCollectorConfig) ([]string, error) { - var results []string - var err error - cmd := config.Config.Command - - if _, err = os.Stat(cmd); err != nil { - //fmt.Println(err.Error()) - if !strings.HasPrefix(cmd, "/") { - cmd = filepath.Join(config.Location, config.Config.Command) - if _, err = os.Stat(cmd); err != nil { - //fmt.Println(err.Error()) - cmd, err = exec.LookPath(config.Config.Command) - } - } - } - if err != nil { - fmt.Println(err.Error()) - return results, err - } - - // TODO: Add timeout - - command := exec.Command(cmd, config.Config.Args) - command.Dir = config.Location - command.Wait() - stdout, err := command.Output() - if err != nil { - //log.error(err.Error()) - fmt.Println(err.Error()) - return results, err - } - - lines := strings.Split(string(stdout), "\n") - - for _, l := range lines { - if strings.HasPrefix(l, "#") { - continue - } - results = append(results, l) - } - return results, err -} - -////////////////////////////////////////////////////////////////////////////// -// Setup sink -////////////////////////////////////////////////////////////////////////////// -func SetupSink(config GlobalConfig) chan string { - - c := make(chan string, 300) - - // TODO: Setup something for sending? Establish HTTP connection? 
- return c -} - -func RunSink(config GlobalConfig, queue *chan string) (*time.Ticker, chan bool) { - - interval := time.Duration(config.Report.Interval) * time.Second - ticker := time.NewTicker(interval) - done := make(chan bool) - - go func() { - for { - select { - case <-done: - return - case t := <-ticker.C: - fmt.Println("SinkTick at", t) - empty := false - var batch []string - for empty == false { - select { - case metric := <-*queue: - fmt.Println(metric) - batch = append(batch, metric) - default: - // No metric available, wait for the next iteration - empty = true - break - } - } - for _, m := range batch { - fmt.Println(m) - } - } - } - }() - return ticker, done -} - -func CloseSink(config GlobalConfig, queue *chan string, ticker *time.Ticker, done chan bool) { - ticker.Stop() - done <- true - close(*queue) -} - -func MainLoop(config GlobalConfig, sink *chan string) (*time.Ticker, chan bool) { - var intConfig []InternalCollectorConfig - intConfig, err := SetupCollectors(config) - if err != nil { - panic(err) - } - - interval := time.Duration(config.Schedule.Node.Frequency) * time.Second - ticker := time.NewTicker(time.Second) - done := make(chan bool) - - go func() { - for { - select { - case <-done: - return - case t := <-ticker.C: - fmt.Println("CollectorTick at", t) - unix := time.Now() - for i, _ := range intConfig { - if time.Duration(unix.Sub(intConfig[i].LastRun)) > interval { - res, err := RunCollector(intConfig[i]) - if err != nil { - //log.error("Collector failed: ", err.Error()) - } else { - //TODO: parse and skip in case of error, encode to []string - for _, r := range res { - if len(r) > 0 { - *sink <- r - } - } - } - intConfig[i].LastRun = time.Now() - } - } - } - } - }() - return ticker, done -} - -func main() { - // fmt.Println("Hello") - // cmd_opts := []string{"la","le","lu"} - // cmd := "echo" - // s := run_cmd(cmd, cmd_opts) - // fmt.Println(s) - // tags := map[string]string { - // "host" : "broadep2", - // } - // fields := map[string]interface{} { - // "value" : float64(1.0), - // } - // fmt.Println(CreatePoint("flops_any", tags, fields, time.Now().UnixNano())) - var config GlobalConfig - LoadGlobalConfiguration("config.json", &config) - - queue := SetupSink(config) - sinkTicker, sinkDone := RunSink(config, &queue) - collectTicker, collectDone := MainLoop(config, &queue) - time.Sleep(1600 * time.Second) - collectTicker.Stop() - collectDone <- true - CloseSink(config, &queue, sinkTicker, sinkDone) - - // var folders []string - // GetCollectorFolders(config.CollectorPath, &folders) - - // for _, path := range folders { - // var col_config CollectorConfig - // LoadCollectorConfiguration(filepath.Join(path, "config.json"), &col_config) - // stdout := run_cmd(filepath.Join(path, col_config.Command), col_config.Args) - - // metrics := strings.Split(stdout, "\n") - // for _, m := range metrics { - // if len(m) > 0 { - // t := strings.Fields(m) - // if len(t) == 2 { - // var s strings.Builder - // fmt.Fprintf(&s, "%s %d", m, time.Now().UnixNano()) - // m = s.String() - // } - // fmt.Println("SEND", m) - // } - // } - // } -} diff --git a/clusterdaemon_simple.go b/clusterdaemon_simple.go deleted file mode 100644 index 66964e8..0000000 --- a/clusterdaemon_simple.go +++ /dev/null @@ -1,190 +0,0 @@ -package main - -import ( - "fmt" - "strings" - "io/ioutil" - "os" - "os/signal" - "strconv" - "time" -) - - -// geht nicht -//enum CollectScope { -// Node: 0, -// Socket, -// Die, -// LLC, -// NUMA, -// Core, -// HWThread -//} - -//var scopeNames = map[CollectScope]string{ -// 
Node: "Node", -// Socket: "Socket", -// Die: "Die", -// LLC: "LLC", -// NUMA: "NUMA", -// Core: "Core", -// HWThread: "HWThread" -//} - -type CollectValue struct { - Name string - Value interface{} - //scope CollectScope -} - -type InitFunc func() error -type ReadFunc func(time.Duration) ([]CollectValue, error) -type CloseFunc func() error -type SinkFunc func([]CollectValue) error - -func read_memavg(duration time.Duration) ([]CollectValue, error) { - var values []CollectValue - data, err := ioutil.ReadFile("/proc/meminfo") - if err != nil { - fmt.Println(err.Error()) - return values, err - } - var matches = map[string]string { - "MemTotal" : "mem_total", - "MemAvailable" : "mem_avail", - "MemFree" : "mem_free", - } - lines := strings.Split(string(data), "\n") - for _, l := range lines { - for i,o := range matches { - if strings.HasPrefix(l, i) { - f := strings.Fields(l) - v, err := strconv.ParseInt(f[1], 10, 0) - if err == nil { - var value CollectValue - // value.Scope = Node - value.Name = o - value.Value = v - values = append(values, value) - } - } - } - } - return values, nil -} - -func read_loadavg(duration time.Duration) ([]CollectValue, error) { - var values []CollectValue - data, err := ioutil.ReadFile("/proc/loadavg") - if err != nil { - fmt.Println(err.Error()) - return values, err - } - var matches = map[int]string { - 0 : "loadavg1m", - 1 : "loadavg5m", - 2 : "loadavg15m", - } - f := strings.Fields(string(data)) - for i, m := range matches { - v, err := strconv.ParseFloat(f[i], 64) - if err == nil { - var value CollectValue - value.Name = m - value.Value = v - // value.Scope = Node - values = append(values, value) - } - } - return values, nil -} - -func read_netstat(duration time.Duration) ([]CollectValue, error) { - var values []CollectValue - data, err := ioutil.ReadFile("/proc/net/dev") - if err != nil { - fmt.Println(err.Error()) - return values, err - } - var matches = map[int]string { - 1 : "bytes_in", - 9 : "bytes_out", - 2 : "pkts_in", - 10 : "pkts_out", - } - lines := strings.Split(string(data), "\n") - for _, l := range lines { - if ! 
strings.Contains(l, ":") { - continue - } - f := strings.Fields(l) - dev := f[0][0:len(f[0])-1] - if dev == "lo" { - continue - } - for i, m := range matches { - v, err := strconv.ParseInt(f[i], 10, 0) - if err == nil { - var value CollectValue - value.Name = fmt.Sprintf("%s_%s", dev, m) - value.Value = v - //value.Scope = Node - values = append(values, value) - } - } - } - return values, nil -} - -func Send(values []CollectValue) error { - for _, v := range values { - fmt.Printf("Name: '%s' Value: '%v'\n", v.Name, v.Value) - } - return nil -} - -func ReadAll(duration time.Duration, reads []ReadFunc, sink SinkFunc) { - for _, f := range reads { - values, err := f(duration) - if err == nil { - sink(values) - } - } -} - -func ReadLoop(interval time.Duration, duration time.Duration, reads []ReadFunc, sink SinkFunc) { - ticker := time.NewTicker(interval) - done := make(chan bool) - sigs := make(chan os.Signal, 1) - signal.Notify(sigs, os.Interrupt) - ReadAll(duration, reads, sink) - go func() { - <-sigs - // Should call all CloseFunc functions here - os.Exit(1) - }() - func() { - select { - case <-done: - return - case t := <-ticker.C: - fmt.Println("Tick at", t) - ReadAll(duration, reads, sink) - } - }() - ticker.Stop() - done <- true -} - -func main() { - //var inits []InitFunc - var reads = []ReadFunc {read_memavg, read_loadavg, read_netstat} - //var closes []CloseFunc - var duration time.Duration - var interval time.Duration - duration = time.Duration(1) * time.Second - interval = time.Duration(10) * time.Second - ReadLoop(interval, duration, reads, Send) - return -} diff --git a/collectors/infinibandMetric.go b/collectors/infinibandMetric.go new file mode 100644 index 0000000..0bb49bb --- /dev/null +++ b/collectors/infinibandMetric.go @@ -0,0 +1,64 @@ +package collectors + +import ( + "fmt" + "io/ioutil" + "log" + "os/exec" + "strconv" + "strings" + "time" +) + +const LIDFILE = `/sys/class/infiniband/mlx4_0/ports/1/lid` + +type InfinibandCollector struct { + MetricCollector +} + +func (m *InfinibandCollector) Init() { + m.name = "InfinibandCollector" + m.setup() +} + +func (m *InfinibandCollector) Read(interval time.Duration) { + buffer, err := ioutil.ReadFile(string(LIDFILE)) + + if err != nil { + log.Print(err) + return + } + + args := fmt.Sprintf("-r %s 1 0xf000", string(buffer)) + + command := exec.Command("/usr/sbin/perfquery", args) + command.Wait() + stdout, err := command.Output() + if err != nil { + log.Print(err) + return + } + + ll := strings.Split(string(stdout), "\n") + + for _, line := range ll { + if strings.HasPrefix(line, "PortRcvData") || strings.HasPrefix(line, "RcvData") { + lv := strings.Fields(line) + v, err := strconv.ParseFloat(lv[1], 64) + if err == nil { + m.node["ib_recv"] = float64(v) + } + } + if strings.HasPrefix(line, "PortXmitData") || strings.HasPrefix(line, "XmtData") { + lv := strings.Fields(line) + v, err := strconv.ParseFloat(lv[1], 64) + if err == nil { + m.node["ib_xmit"] = float64(v) + } + } + } +} + +func (m *InfinibandCollector) Close() { + return +} diff --git a/collectors/likwid.go b/collectors/likwid.go deleted file mode 100644 index 829b5c3..0000000 --- a/collectors/likwid.go +++ /dev/null @@ -1,64 +0,0 @@ -package collectors - -import ( - "bytes" - "fmt" - "time" - - protocol "github.com/influxdata/line-protocol" -) - -type LikwidCollector struct { - name string - tags []*protocol.Tag - fields []*protocol.Field - t time.Time - encoder *protocol.Encoder -} - -func (c *LikwidCollector) Name() string { - return c.name -} -func (c 
*LikwidCollector) TagList() []*protocol.Tag { - return c.tags -} - -func (c *LikwidCollector) FieldList() []*protocol.Field { - return c.fields -} - -func (c *LikwidCollector) Time() time.Time { - return c.t -} - -func (c *LikwidCollector) New() { - buf := &bytes.Buffer{} - c.encoder = protocol.NewEncoder(buf) - c.encoder.SetMaxLineBytes(1024) -} - -func (c *LikwidCollector) Start( - level string, - frequency time.Duration, - duration int) { - ticker := time.NewTicker(frequency * time.Second) - done := make(chan bool) - - go func() { - for { - select { - case <-done: - return - case t := <-ticker.C: - fmt.Println("Tick at", t) - - c.encoder.Encode(c) - } - } - }() - - time.Sleep(1600 * time.Second) - ticker.Stop() - done <- true - fmt.Println("Ticker stopped") -} diff --git a/collectors/likwid/bstrlib.h b/collectors/likwid/bstrlib.h new file mode 100644 index 0000000..02a836e --- /dev/null +++ b/collectors/likwid/bstrlib.h @@ -0,0 +1,301 @@ +/* + * ======================================================================================= + * This source file is part of the bstring string library. This code was + * written by Paul Hsieh in 2002-2008, and is covered by the BSD open source + * license and the GPL. Refer to the accompanying documentation for details + * on usage and license. + */ +/* + * bstrlib.c + * + * This file is the core module for implementing the bstring functions. + */ + +#ifndef BSTRLIB_INCLUDE +#define BSTRLIB_INCLUDE + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include + +#if !defined (BSTRLIB_VSNP_OK) && !defined (BSTRLIB_NOVSNP) +# if defined (__TURBOC__) && !defined (__BORLANDC__) +# define BSTRLIB_NOVSNP +# endif +#endif + +#define BSTR_ERR (-1) +#define BSTR_OK (0) +#define BSTR_BS_BUFF_LENGTH_GET (0) + +typedef struct tagbstring * bstring; +typedef const struct tagbstring * const_bstring; + +/* Copy functions */ +#define cstr2bstr bfromcstr +extern bstring bfromcstr (const char * str); +extern bstring bfromcstralloc (int mlen, const char * str); +extern bstring blk2bstr (const void * blk, int len); +extern char * bstr2cstr (const_bstring s, char z); +extern int bcstrfree (char * s); +extern bstring bstrcpy (const_bstring b1); +extern int bassign (bstring a, const_bstring b); +extern int bassignmidstr (bstring a, const_bstring b, int left, int len); +extern int bassigncstr (bstring a, const char * str); +extern int bassignblk (bstring a, const void * s, int len); + +/* Destroy function */ +extern int bdestroy (bstring b); + +/* Space allocation hinting functions */ +extern int balloc (bstring s, int len); +extern int ballocmin (bstring b, int len); + +/* Substring extraction */ +extern bstring bmidstr (const_bstring b, int left, int len); + +/* Various standard manipulations */ +extern int bconcat (bstring b0, const_bstring b1); +extern int bconchar (bstring b0, char c); +extern int bcatcstr (bstring b, const char * s); +extern int bcatblk (bstring b, const void * s, int len); +extern int binsert (bstring s1, int pos, const_bstring s2, unsigned char fill); +extern int binsertch (bstring s1, int pos, int len, unsigned char fill); +extern int breplace (bstring b1, int pos, int len, const_bstring b2, unsigned char fill); +extern int bdelete (bstring s1, int pos, int len); +extern int bsetstr (bstring b0, int pos, const_bstring b1, unsigned char fill); +extern int btrunc (bstring b, int n); + +/* Scan/search functions */ +extern int bstricmp (const_bstring b0, const_bstring b1); +extern int bstrnicmp (const_bstring b0, const_bstring 
b1, int n); +extern int biseqcaseless (const_bstring b0, const_bstring b1); +extern int bisstemeqcaselessblk (const_bstring b0, const void * blk, int len); +extern int biseq (const_bstring b0, const_bstring b1); +extern int bisstemeqblk (const_bstring b0, const void * blk, int len); +extern int biseqcstr (const_bstring b, const char * s); +extern int biseqcstrcaseless (const_bstring b, const char * s); +extern int bstrcmp (const_bstring b0, const_bstring b1); +extern int bstrncmp (const_bstring b0, const_bstring b1, int n); +extern int binstr (const_bstring s1, int pos, const_bstring s2); +extern int binstrr (const_bstring s1, int pos, const_bstring s2); +extern int binstrcaseless (const_bstring s1, int pos, const_bstring s2); +extern int binstrrcaseless (const_bstring s1, int pos, const_bstring s2); +extern int bstrchrp (const_bstring b, int c, int pos); +extern int bstrrchrp (const_bstring b, int c, int pos); +#define bstrchr(b,c) bstrchrp ((b), (c), 0) +#define bstrrchr(b,c) bstrrchrp ((b), (c), blength(b)-1) +extern int binchr (const_bstring b0, int pos, const_bstring b1); +extern int binchrr (const_bstring b0, int pos, const_bstring b1); +extern int bninchr (const_bstring b0, int pos, const_bstring b1); +extern int bninchrr (const_bstring b0, int pos, const_bstring b1); +extern int bfindreplace (bstring b, const_bstring find, const_bstring repl, int pos); +extern int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl, int pos); + +/* List of string container functions */ +struct bstrList { + int qty, mlen; + bstring * entry; +}; +extern struct bstrList * bstrListCreate (void); +extern int bstrListDestroy (struct bstrList * sl); +extern int bstrListAlloc (struct bstrList * sl, int msz); +extern int bstrListAllocMin (struct bstrList * sl, int msz); + +/* String split and join functions */ +extern struct bstrList * bsplit (const_bstring str, unsigned char splitChar); +extern struct bstrList * bsplits (const_bstring str, const_bstring splitStr); +extern struct bstrList * bsplitstr (const_bstring str, const_bstring splitStr); +extern bstring bjoin (const struct bstrList * bl, const_bstring sep); +extern int bsplitcb (const_bstring str, unsigned char splitChar, int pos, + int (* cb) (void * parm, int ofs, int len), void * parm); +extern int bsplitscb (const_bstring str, const_bstring splitStr, int pos, + int (* cb) (void * parm, int ofs, int len), void * parm); +extern int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos, + int (* cb) (void * parm, int ofs, int len), void * parm); + +/* Miscellaneous functions */ +extern int bpattern (bstring b, int len); +extern int btoupper (bstring b); +extern int btolower (bstring b); +extern int bltrimws (bstring b); +extern int brtrimws (bstring b); +extern int btrimws (bstring b); + +#if !defined (BSTRLIB_NOVSNP) +extern bstring bformat (const char * fmt, ...); +extern int bformata (bstring b, const char * fmt, ...); +extern int bassignformat (bstring b, const char * fmt, ...); +extern int bvcformata (bstring b, int count, const char * fmt, va_list arglist); + +#define bvformata(ret, b, fmt, lastarg) { \ +bstring bstrtmp_b = (b); \ +const char * bstrtmp_fmt = (fmt); \ +int bstrtmp_r = BSTR_ERR, bstrtmp_sz = 16; \ + for (;;) { \ + va_list bstrtmp_arglist; \ + va_start (bstrtmp_arglist, lastarg); \ + bstrtmp_r = bvcformata (bstrtmp_b, bstrtmp_sz, bstrtmp_fmt, bstrtmp_arglist); \ + va_end (bstrtmp_arglist); \ + if (bstrtmp_r >= 0) { /* Everything went ok */ \ + bstrtmp_r = BSTR_OK; \ + break; \ + } else if (-bstrtmp_r 
<= bstrtmp_sz) { /* A real error? */ \ + bstrtmp_r = BSTR_ERR; \ + break; \ + } \ + bstrtmp_sz = -bstrtmp_r; /* Doubled or target size */ \ + } \ + ret = bstrtmp_r; \ +} + +#endif + +typedef int (*bNgetc) (void *parm); +typedef size_t (* bNread) (void *buff, size_t elsize, size_t nelem, void *parm); + +/* Input functions */ +extern bstring bgets (bNgetc getcPtr, void * parm, char terminator); +extern bstring bread (bNread readPtr, void * parm); +extern int bgetsa (bstring b, bNgetc getcPtr, void * parm, char terminator); +extern int bassigngets (bstring b, bNgetc getcPtr, void * parm, char terminator); +extern int breada (bstring b, bNread readPtr, void * parm); + +/* Stream functions */ +extern struct bStream * bsopen (bNread readPtr, void * parm); +extern void * bsclose (struct bStream * s); +extern int bsbufflength (struct bStream * s, int sz); +extern int bsreadln (bstring b, struct bStream * s, char terminator); +extern int bsreadlns (bstring r, struct bStream * s, const_bstring term); +extern int bsread (bstring b, struct bStream * s, int n); +extern int bsreadlna (bstring b, struct bStream * s, char terminator); +extern int bsreadlnsa (bstring r, struct bStream * s, const_bstring term); +extern int bsreada (bstring b, struct bStream * s, int n); +extern int bsunread (struct bStream * s, const_bstring b); +extern int bspeek (bstring r, const struct bStream * s); +extern int bssplitscb (struct bStream * s, const_bstring splitStr, + int (* cb) (void * parm, int ofs, const_bstring entry), void * parm); +extern int bssplitstrcb (struct bStream * s, const_bstring splitStr, + int (* cb) (void * parm, int ofs, const_bstring entry), void * parm); +extern int bseof (const struct bStream * s); + +struct tagbstring { + int mlen; + int slen; + unsigned char * data; +}; + +/* Accessor macros */ +#define blengthe(b, e) (((b) == (void *)0 || (b)->slen < 0) ? (int)(e) : ((b)->slen)) +#define blength(b) (blengthe ((b), 0)) +#define bdataofse(b, o, e) (((b) == (void *)0 || (b)->data == (void*)0) ? (char *)(e) : ((char *)(b)->data) + (o)) +#define bdataofs(b, o) (bdataofse ((b), (o), (void *)0)) +#define bdatae(b, e) (bdataofse (b, 0, e)) +#define bdata(b) (bdataofs (b, 0)) +#define bchare(b, p, e) ((((unsigned)(p)) < (unsigned)blength(b)) ? ((b)->data[(p)]) : (e)) +#define bchar(b, p) bchare ((b), (p), '\0') + +/* Static constant string initialization macro */ +#define bsStaticMlen(q,m) {(m), (int) sizeof(q)-1, (unsigned char *) ("" q "")} +#if defined(_MSC_VER) +# define bsStatic(q) bsStaticMlen(q,-32) +#endif +#ifndef bsStatic +# define bsStatic(q) bsStaticMlen(q,-__LINE__) +#endif + +/* Static constant block parameter pair */ +#define bsStaticBlkParms(q) ((void *)("" q "")), ((int) sizeof(q)-1) + +/* Reference building macros */ +#define cstr2tbstr btfromcstr +#define btfromcstr(t,s) { \ + (t).data = (unsigned char *) (s); \ + (t).slen = ((t).data) ? 
((int) (strlen) ((char *)(t).data)) : 0; \ + (t).mlen = -1; \ +} +#define blk2tbstr(t,s,l) { \ + (t).data = (unsigned char *) (s); \ + (t).slen = l; \ + (t).mlen = -1; \ +} +#define btfromblk(t,s,l) blk2tbstr(t,s,l) +#define bmid2tbstr(t,b,p,l) { \ + const_bstring bstrtmp_s = (b); \ + if (bstrtmp_s && bstrtmp_s->data && bstrtmp_s->slen >= 0) { \ + int bstrtmp_left = (p); \ + int bstrtmp_len = (l); \ + if (bstrtmp_left < 0) { \ + bstrtmp_len += bstrtmp_left; \ + bstrtmp_left = 0; \ + } \ + if (bstrtmp_len > bstrtmp_s->slen - bstrtmp_left) \ + bstrtmp_len = bstrtmp_s->slen - bstrtmp_left; \ + if (bstrtmp_len <= 0) { \ + (t).data = (unsigned char *)""; \ + (t).slen = 0; \ + } else { \ + (t).data = bstrtmp_s->data + bstrtmp_left; \ + (t).slen = bstrtmp_len; \ + } \ + } else { \ + (t).data = (unsigned char *)""; \ + (t).slen = 0; \ + } \ + (t).mlen = -__LINE__; \ +} +#define btfromblkltrimws(t,s,l) { \ + int bstrtmp_idx = 0, bstrtmp_len = (l); \ + unsigned char * bstrtmp_s = (s); \ + if (bstrtmp_s && bstrtmp_len >= 0) { \ + for (; bstrtmp_idx < bstrtmp_len; bstrtmp_idx++) { \ + if (!isspace (bstrtmp_s[bstrtmp_idx])) break; \ + } \ + } \ + (t).data = bstrtmp_s + bstrtmp_idx; \ + (t).slen = bstrtmp_len - bstrtmp_idx; \ + (t).mlen = -__LINE__; \ +} +#define btfromblkrtrimws(t,s,l) { \ + int bstrtmp_len = (l) - 1; \ + unsigned char * bstrtmp_s = (s); \ + if (bstrtmp_s && bstrtmp_len >= 0) { \ + for (; bstrtmp_len >= 0; bstrtmp_len--) { \ + if (!isspace (bstrtmp_s[bstrtmp_len])) break; \ + } \ + } \ + (t).data = bstrtmp_s; \ + (t).slen = bstrtmp_len + 1; \ + (t).mlen = -__LINE__; \ +} +#define btfromblktrimws(t,s,l) { \ + int bstrtmp_idx = 0, bstrtmp_len = (l) - 1; \ + unsigned char * bstrtmp_s = (s); \ + if (bstrtmp_s && bstrtmp_len >= 0) { \ + for (; bstrtmp_idx <= bstrtmp_len; bstrtmp_idx++) { \ + if (!isspace (bstrtmp_s[bstrtmp_idx])) break; \ + } \ + for (; bstrtmp_len >= bstrtmp_idx; bstrtmp_len--) { \ + if (!isspace (bstrtmp_s[bstrtmp_len])) break; \ + } \ + } \ + (t).data = bstrtmp_s + bstrtmp_idx; \ + (t).slen = bstrtmp_len + 1 - bstrtmp_idx; \ + (t).mlen = -__LINE__; \ +} + +/* Write protection macros */ +#define bwriteprotect(t) { if ((t).mlen >= 0) (t).mlen = -1; } +#define bwriteallow(t) { if ((t).mlen == -1) (t).mlen = (t).slen + ((t).slen == 0); } +#define biswriteprotected(t) ((t).mlen <= 0) + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/collectors/likwid/groups/CLX/BRANCH.txt b/collectors/likwid/groups/CLX/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/CLX/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. 
The branch misprediction ratio sets directly +into relation what ratio of all branch instruction where mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/CLX/CACHES.txt b/collectors/likwid/groups/CLX/CACHES.txt new file mode 100644 index 0000000..c700dd4 --- /dev/null +++ b/collectors/likwid/groups/CLX/CACHES.txt @@ -0,0 +1,143 @@ +SHORT Cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L1D_M_EVICT +PMC2 L2_LINES_IN_ALL +PMC3 L2_TRANS_L2_WB +CBOX0C1 LLC_VICTIMS_M_STATE +CBOX1C1 LLC_VICTIMS_M_STATE +CBOX2C1 LLC_VICTIMS_M_STATE +CBOX3C1 LLC_VICTIMS_M_STATE +CBOX4C1 LLC_VICTIMS_M_STATE +CBOX5C1 LLC_VICTIMS_M_STATE +CBOX6C1 LLC_VICTIMS_M_STATE +CBOX7C1 LLC_VICTIMS_M_STATE +CBOX8C1 LLC_VICTIMS_M_STATE +CBOX9C1 LLC_VICTIMS_M_STATE +CBOX10C1 LLC_VICTIMS_M_STATE +CBOX11C1 LLC_VICTIMS_M_STATE +CBOX12C1 LLC_VICTIMS_M_STATE +CBOX13C1 LLC_VICTIMS_M_STATE +CBOX14C1 LLC_VICTIMS_M_STATE +CBOX15C1 LLC_VICTIMS_M_STATE +CBOX16C1 LLC_VICTIMS_M_STATE +CBOX17C1 LLC_VICTIMS_M_STATE +CBOX18C1 LLC_VICTIMS_M_STATE +CBOX19C1 LLC_VICTIMS_M_STATE +CBOX20C1 LLC_VICTIMS_M_STATE +CBOX21C1 LLC_VICTIMS_M_STATE +CBOX22C1 LLC_VICTIMS_M_STATE +CBOX23C1 LLC_VICTIMS_M_STATE +CBOX24C1 LLC_VICTIMS_M_STATE +CBOX25C1 LLC_VICTIMS_M_STATE +CBOX26C1 LLC_VICTIMS_M_STATE +CBOX27C1 LLC_VICTIMS_M_STATE +CBOX0C0 LLC_LOOKUP_DATA_READ +CBOX1C0 LLC_LOOKUP_DATA_READ +CBOX2C0 LLC_LOOKUP_DATA_READ +CBOX3C0 LLC_LOOKUP_DATA_READ +CBOX4C0 LLC_LOOKUP_DATA_READ +CBOX5C0 LLC_LOOKUP_DATA_READ +CBOX6C0 LLC_LOOKUP_DATA_READ +CBOX7C0 LLC_LOOKUP_DATA_READ +CBOX8C0 LLC_LOOKUP_DATA_READ +CBOX9C0 LLC_LOOKUP_DATA_READ +CBOX10C0 LLC_LOOKUP_DATA_READ +CBOX11C0 LLC_LOOKUP_DATA_READ +CBOX12C0 LLC_LOOKUP_DATA_READ +CBOX13C0 LLC_LOOKUP_DATA_READ +CBOX14C0 LLC_LOOKUP_DATA_READ +CBOX15C0 LLC_LOOKUP_DATA_READ +CBOX16C0 LLC_LOOKUP_DATA_READ +CBOX17C0 LLC_LOOKUP_DATA_READ +CBOX18C0 LLC_LOOKUP_DATA_READ +CBOX19C0 LLC_LOOKUP_DATA_READ +CBOX20C0 LLC_LOOKUP_DATA_READ +CBOX21C0 LLC_LOOKUP_DATA_READ +CBOX22C0 LLC_LOOKUP_DATA_READ +CBOX23C0 LLC_LOOKUP_DATA_READ +CBOX24C0 LLC_LOOKUP_DATA_READ +CBOX25C0 LLC_LOOKUP_DATA_READ +CBOX26C0 LLC_LOOKUP_DATA_READ +CBOX27C0 LLC_LOOKUP_DATA_READ +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 +L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time +L3 to L2 load data volume [GBytes] 1.0E-09*PMC2*64.0 +L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time +L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 +System to L3 bandwidth [MBytes/s] 
1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0)*64.0/time +System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0)*64.0 +L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64/time +L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64 +L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64.0/time +L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64.0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 + +LONG +Formulas: +L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time +L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64 +L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time +L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64 +L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time +L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64 +L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time +L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64 +L2 to L3 evict bandwidth [MBytes/s] = 
1.0E-06*L2_TRANS_L2_WB*64/time
+L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64
+L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
+L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
+System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ))*64/time
+System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ))*64
+L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M_STATE))*64/time
+L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M_STATE))*64
+L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M_STATE))*64/time
+L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M_STATE))*64
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0
+-
+Group to measure cache transfers between L1 and memory. Please note that the
+L3 to/from system metrics contain any traffic to the system (memory,
+Intel QPI, etc.), so they do not isolate memory traffic: the memory read
+bandwidth and the L3 to L2 bandwidth are commonly higher than the system to
+L3 bandwidth.
+
diff --git a/collectors/likwid/groups/CLX/CLOCK.txt b/collectors/likwid/groups/CLX/CLOCK.txt
new file mode 100644
index 0000000..b81bee6
--- /dev/null
+++ b/collectors/likwid/groups/CLX/CLOCK.txt
@@ -0,0 +1,26 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+UBOXFIX UNCORE_CLOCK
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+Uncore Clock [MHz] 1.E-06*UBOXFIX/time
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+
+LONG
+Formulas:
+Power = PWR_PKG_ENERGY / time
+Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time
+-
+Cascade Lake implements the RAPL interface. This interface enables monitoring
+of the consumed energy on the package (socket) level.
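The CLOCK group's Power metric is simply the RAPL energy counter divided by the
measurement time (Power [W] = PWR_PKG_ENERGY/time). A minimal Go sketch of that
derivation, assuming hypothetical energy readings taken at the interval
boundaries; the identifiers are illustrative and not part of this patch:

package main

import "fmt"

// powerWatts derives average power from two RAPL energy readings [J]
// taken intervalSeconds apart, mirroring the metric PWR_PKG_ENERGY/time.
func powerWatts(energyStartJ, energyEndJ, intervalSeconds float64) float64 {
	return (energyEndJ - energyStartJ) / intervalSeconds
}

func main() {
	// Hypothetical readings: 1500 J consumed over a 10 s interval.
	fmt.Printf("Power [W]: %.1f\n", powerWatts(120000.0, 121500.0, 10.0)) // 150.0
}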
+ diff --git a/collectors/likwid/groups/CLX/CYCLE_ACTIVITY.txt b/collectors/likwid/groups/CLX/CYCLE_ACTIVITY.txt new file mode 100644 index 0000000..c432a44 --- /dev/null +++ b/collectors/likwid/groups/CLX/CYCLE_ACTIVITY.txt @@ -0,0 +1,38 @@ +SHORT Cycle Activities + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING +PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING +PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING +PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Cycles without execution [%] (PMC3/FIXC1)*100 +Cycles without execution due to L1D [%] (PMC2/FIXC1)*100 +Cycles without execution due to L2 [%] (PMC0/FIXC1)*100 +Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100 +Cycles with stalls due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles with stalls due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100 +-- +This performance group measures the cycles while waiting for data from the cache +and memory hierarchy. +CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts number of cycles nothing is executed on +any execution port. +CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while memory subsystem has an +outstanding load. 
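Each CYCLE_ACTIVITY metric above is a percentage of unhalted core cycles, e.g.
(PMC3/FIXC1)*100. A short Go sketch of the computation, with hypothetical
counter values standing in for the PMC and FIXC1 readings:

package main

import "fmt"

// stallPercent mirrors metrics such as
// "Cycles without execution [%] (PMC3/FIXC1)*100".
func stallPercent(pendingCycles, unhaltedCycles float64) float64 {
	return pendingCycles / unhaltedCycles * 100.0
}

func main() {
	unhalted := 2.0e9 // CPU_CLK_UNHALTED_CORE (hypothetical)
	noExec := 6.0e8   // CYCLE_ACTIVITY_CYCLES_NO_EXECUTE (hypothetical)
	fmt.Printf("Cycles without execution [%%]: %.1f\n", stallPercent(noExec, unhalted)) // 30.0
}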
diff --git a/collectors/likwid/groups/CLX/CYCLE_STALLS.txt b/collectors/likwid/groups/CLX/CYCLE_STALLS.txt new file mode 100644 index 0000000..795aeb9 --- /dev/null +++ b/collectors/likwid/groups/CLX/CYCLE_STALLS.txt @@ -0,0 +1,45 @@ +SHORT Cycle Activities (Stalls) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING +PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING +PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING +PMC3 CYCLE_ACTIVITY_STALLS_TOTAL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Total execution stalls PMC3 +Stalls caused by L1D misses [%] (PMC2/PMC3)*100 +Stalls caused by L2 misses [%] (PMC0/PMC3)*100 +Stalls caused by memory loads [%] (PMC1/PMC3)*100 +Execution stall rate [%] (PMC3/FIXC1)*100 +Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100 +Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100 +Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL +Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100 +-- +This performance group measures the stalls caused by data traffic in the cache +hierarchy. +CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls. +CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while memory subsystem has +an outstanding load. diff --git a/collectors/likwid/groups/CLX/DATA.txt b/collectors/likwid/groups/CLX/DATA.txt new file mode 100644 index 0000000..4e6e938 --- /dev/null +++ b/collectors/likwid/groups/CLX/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_INST_RETIRED_ALL_LOADS +PMC1 MEM_INST_RETIRED_ALL_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES +- +This is a metric to determine your load to store ratio. 
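The DATA group's only derived value is the quotient of the two counters; in
practice a divide-by-zero guard is worth adding for intervals in which no
stores retire. A hedged Go sketch (names are illustrative):

package main

import (
	"fmt"
	"math"
)

// loadStoreRatio mirrors "Load to store ratio PMC0/PMC1"; it returns NaN
// when no stores were counted, so callers can skip the sample.
func loadStoreRatio(loads, stores float64) float64 {
	if stores == 0 {
		return math.NaN()
	}
	return loads / stores
}

func main() {
	fmt.Println(loadStoreRatio(3.0e9, 1.0e9)) // 3
}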
+ diff --git a/collectors/likwid/groups/CLX/DIVIDE.txt b/collectors/likwid/groups/CLX/DIVIDE.txt new file mode 100644 index 0000000..2c6222d --- /dev/null +++ b/collectors/likwid/groups/CLX/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ARITH_DIVIDER_COUNT +PMC1 ARITH_DIVIDER_ACTIVE + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_DIVIDER_COUNT +Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT +-- +This performance group measures the average latency of divide operations diff --git a/collectors/likwid/groups/CLX/ENERGY.txt b/collectors/likwid/groups/CLX/ENERGY.txt new file mode 100644 index 0000000..fe7829f --- /dev/null +++ b/collectors/likwid/groups/CLX/ENERGY.txt @@ -0,0 +1,35 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR3 PWR_DRAM_ENERGY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +Broadwell implements the new RAPL interface. This interface enables to +monitor the consumed energy on the package (socket) and DRAM level. + diff --git a/collectors/likwid/groups/CLX/FLOPS_AVX.txt b/collectors/likwid/groups/CLX/FLOPS_AVX.txt new file mode 100644 index 0000000..e44a913 --- /dev/null +++ b/collectors/likwid/groups/CLX/FLOPS_AVX.txt @@ -0,0 +1,25 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0+PMC2*16.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0+PMC3*8.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +- +Packed 32b AVX FLOPs rates. 
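The packed AVX rates weight each retired instruction by its vector width: 8
single-precision elements per 256-bit instruction and 16 per 512-bit
instruction. A minimal Go sketch of the Packed SP formula, using hypothetical
event counts:

package main

import "fmt"

// packedSPMFlops mirrors
// "Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0+PMC2*16.0)/time".
func packedSPMFlops(avx256SP, avx512SP, timeSeconds float64) float64 {
	return 1.0e-06 * (avx256SP*8.0 + avx512SP*16.0) / timeSeconds
}

func main() {
	// Hypothetical: 1e9 256-bit and 5e8 512-bit packed SP instructions in 2 s.
	fmt.Printf("Packed SP [MFLOP/s]: %.0f\n", packedSPMFlops(1.0e9, 5.0e8, 2.0)) // 8000
}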
diff --git a/collectors/likwid/groups/CLX/FLOPS_DP.txt b/collectors/likwid/groups/CLX/FLOPS_DP.txt new file mode 100644 index 0000000..7d6af79 --- /dev/null +++ b/collectors/likwid/groups/CLX/FLOPS_DP.txt @@ -0,0 +1,34 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time +AVX512 DP [MFLOP/s] 1.0E-06*(PMC3*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +AVX512 DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE) +- +SSE scalar and packed double precision FLOP rates. 
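The vectorization ratio relates packed (SIMD) FP instructions to all FP
instructions, scalar included. A small Go sketch under the same assumption of
hypothetical counter values:

package main

import "fmt"

// vectorizationRatio mirrors
// "Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3)".
func vectorizationRatio(packed128, scalar, packed256, packed512 float64) float64 {
	packed := packed128 + packed256 + packed512
	return 100.0 * packed / (packed + scalar)
}

func main() {
	// Hypothetical counts: mostly packed work, some scalar remainder.
	fmt.Printf("Vectorization ratio: %.1f\n", vectorizationRatio(1e8, 3e8, 5e8, 6e8)) // 80.0
}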
+ diff --git a/collectors/likwid/groups/CLX/FLOPS_SP.txt b/collectors/likwid/groups/CLX/FLOPS_SP.txt new file mode 100644 index 0000000..39fb08d --- /dev/null +++ b/collectors/likwid/groups/CLX/FLOPS_SP.txt @@ -0,0 +1,34 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time +AVX512 SP [MFLOP/s] 1.0E-06*(PMC3*16.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +AVX512 SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE) +- +SSE scalar and packed single precision FLOP rates. + diff --git a/collectors/likwid/groups/CLX/L2.txt b/collectors/likwid/groups/CLX/L2.txt new file mode 100644 index 0000000..1a92a95 --- /dev/null +++ b/collectors/likwid/groups/CLX/L2.txt @@ -0,0 +1,38 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L1D_M_EVICT +PMC2 ICACHE_64B_IFTAG_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_64B_IFTAG_MISS)*64/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_64B_IFTAG_MISS)*64 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the +number of cache line allocated in the L1 and the number of modified cache lines +evicted from the L1. 
The group also outputs the total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1 and traffic caused by misses in the
+L1 instruction cache.
+
diff --git a/collectors/likwid/groups/CLX/L2CACHE.txt b/collectors/likwid/groups/CLX/L2CACHE.txt
new file mode 100644
index 0000000..9b5dd4b
--- /dev/null
+++ b/collectors/likwid/groups/CLX/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_TRANS_ALL_REQUESTS
+PMC1 L2_RQSTS_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. The L2 request rate tells you how data intensive your code is,
+i.e. how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure of how often it was necessary to get
+cache lines from memory. And finally, the L2 miss ratio tells you how many of
+your memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm, you should try
+to get the data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/collectors/likwid/groups/CLX/L3.txt b/collectors/likwid/groups/CLX/L3.txt
new file mode 100644
index 0000000..98d1d9e
--- /dev/null
+++ b/collectors/likwid/groups/CLX/L3.txt
@@ -0,0 +1,36 @@
+SHORT L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_LINES_IN_ALL
+PMC1 L2_TRANS_L2_WB
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed from
+the number of cache lines allocated in the L2 and the number of modified cache
+lines evicted from the L2. This group also outputs the data volume transferred
+between the L3 and the measured cores' L2 caches. Note that this bandwidth also
+includes data transfers due to a write allocate load on a store miss in L2.
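All bandwidth metrics in these cache groups follow one pattern: count
cache-line transfers, multiply by the 64-byte line size, and divide by time
(or scale by 1.0E-09 for the data volume). A compact Go sketch of that pattern
with hypothetical inputs:

package main

import "fmt"

const cacheLineBytes = 64.0

// bandwidthMBs mirrors e.g.
// "L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time".
func bandwidthMBs(lines, timeSeconds float64) float64 {
	return 1.0e-06 * lines * cacheLineBytes / timeSeconds
}

// volumeGB mirrors "L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0".
func volumeGB(lines float64) float64 {
	return 1.0e-09 * lines * cacheLineBytes
}

func main() {
	lines := 2.5e9 // L2_LINES_IN_ALL + L2_TRANS_L2_WB (hypothetical)
	fmt.Printf("%.0f MBytes/s, %.0f GBytes\n", bandwidthMBs(lines, 10.0), volumeGB(lines)) // 16000, 160
}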
+ diff --git a/collectors/likwid/groups/CLX/L3CACHE.txt b/collectors/likwid/groups/CLX/L3CACHE.txt new file mode 100644 index 0000000..bc664d1 --- /dev/null +++ b/collectors/likwid/groups/CLX/L3CACHE.txt @@ -0,0 +1,35 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_RETIRED_L3_HIT +PMC1 MEM_LOAD_RETIRED_L3_MISS +PMC2 UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate (PMC0+PMC1)/PMC2 +L3 miss rate PMC1/PMC2 +L3 miss ratio PMC1/(PMC0+PMC1) + +LONG +Formulas: +L3 request rate = (MEM_LOAD_RETIRED_L3_HIT+MEM_LOAD_RETIRED_L3_MISS)/UOPS_RETIRED_ALL +L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL +L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/(MEM_LOAD_RETIRED_L3_HIT+MEM_LOAD_RETIRED_L3_MISS) +- +This group measures the locality of your data accesses with regard to the +L3 cache. L3 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L3 miss rate gives a measure how often it was necessary to get +cache lines from memory. And finally L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/CLX/MEM.txt b/collectors/likwid/groups/CLX/MEM.txt new file mode 100644 index 0000000..3d50ecb --- /dev/null +++ b/collectors/likwid/groups/CLX/MEM.txt @@ -0,0 +1,48 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. 
+Since this group is based on Uncore events it is only possible to measure on a +per-socket basis. Some of the counters may not be available on your system. +Also outputs total data volume transferred from main memory. +The same metrics are provided by the HA group. + diff --git a/collectors/likwid/groups/CLX/MEM_DP.txt b/collectors/likwid/groups/CLX/MEM_DP.txt new file mode 100644 index 0000000..68e8684 --- /dev/null +++ b/collectors/likwid/groups/CLX/MEM_DP.txt @@ -0,0 +1,70 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +Operational intensity (PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 +Operational intensity = 
(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/((SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0) +-- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per-socket basis. Also outputs total data volume transferred from main memory. +SSE scalar and packed double precision FLOP rates. Also reports on packed AVX +32b instructions. +The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. diff --git a/collectors/likwid/groups/CLX/MEM_SP.txt b/collectors/likwid/groups/CLX/MEM_SP.txt new file mode 100644 index 0000000..73452f2 --- /dev/null +++ b/collectors/likwid/groups/CLX/MEM_SP.txt @@ -0,0 +1,70 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +Operational intensity (PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 
1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 +Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/((SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0) +-- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per-socket basis. Also outputs total data volume transferred from main memory. +SSE scalar and packed single precision FLOP rates. Also reports on packed AVX +32b instructions. +The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. diff --git a/collectors/likwid/groups/CLX/PMM.txt b/collectors/likwid/groups/CLX/PMM.txt new file mode 100644 index 0000000..dbaa6ab --- /dev/null +++ b/collectors/likwid/groups/CLX/PMM.txt @@ -0,0 +1,46 @@ +SHORT Intel Optane DC bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +MBOX0C0 PMM_CMD1_RD +MBOX0C1 PMM_CMD1_WR +MBOX1C0 PMM_CMD1_RD +MBOX1C1 PMM_CMD1_WR +MBOX2C0 PMM_CMD1_RD +MBOX2C1 PMM_CMD1_WR +MBOX3C0 PMM_CMD1_RD +MBOX3C1 PMM_CMD1_WR +MBOX4C0 PMM_CMD1_RD +MBOX4C1 PMM_CMD1_WR +MBOX5C0 PMM_CMD1_RD +MBOX5C1 PMM_CMD1_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +PMM read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time +PMM read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 +PMM write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +PMM write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +PMM bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +PMM data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 + +LONG +Formulas: +PMM read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime +PMM read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +PMM write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime +PMM write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +PMM bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime +PMM data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +- +Profiling group to measure data rate and volume for accesses to Intel Optane DC +persistent memory. The Intel Optane DC devices are handled by the memory +controllers but require different events.
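All of the MEM-style groups above follow the same pattern: sum one read and one write counter per memory channel (MBOX0..MBOX5), then scale by 64 bytes per event. A hedged Go sketch of that aggregation, with invented per-channel counts standing in for CAS_COUNT_RD/CAS_COUNT_WR (or PMM_CMD1_RD/PMM_CMD1_WR):

    package main

    import "fmt"

    // Sketch: aggregate per-channel uncore counts into socket bandwidth.
    // 64 bytes per counted event; all values are hypothetical.
    func main() {
        rd := []float64{1.0e8, 9.8e7, 1.1e8, 1.0e8, 9.9e7, 1.2e8} // MBOX0C0..MBOX5C0
        wr := []float64{4.0e7, 3.9e7, 4.2e7, 4.1e7, 3.8e7, 4.0e7} // MBOX0C1..MBOX5C1
        runtime := 5.0                                            // seconds

        sum := func(xs []float64) (s float64) {
            for _, x := range xs {
                s += x
            }
            return
        }
        readBW := 1.0e-06 * sum(rd) * 64.0 / runtime
        writeBW := 1.0e-06 * sum(wr) * 64.0 / runtime
        fmt.Printf("read  %.1f MBytes/s\n", readBW)
        fmt.Printf("write %.1f MBytes/s\n", writeBW)
        fmt.Printf("total %.1f MBytes/s\n", readBW+writeBW)
    }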
+ diff --git a/collectors/likwid/groups/CLX/TLB_DATA.txt b/collectors/likwid/groups/CLX/TLB_DATA.txt new file mode 100644 index 0000000..10ee5e1 --- /dev/null +++ b/collectors/likwid/groups/CLX/TLB_DATA.txt @@ -0,0 +1,35 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK +PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK +PMC2 DTLB_LOAD_MISSES_WALK_ACTIVE +PMC3 DTLB_STORE_MISSES_WALK_ACTIVE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB load misses PMC0 +L1 DTLB load miss rate PMC0/FIXC0 +L1 DTLB load miss duration [Cyc] PMC2/PMC0 +L1 DTLB store misses PMC1 +L1 DTLB store miss rate PMC1/FIXC0 +L1 DTLB store miss duration [Cyc] PMC3/PMC1 + +LONG +Formulas: +L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_ACTIVE / DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK +L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_ACTIVE / DTLB_STORE_MISSES_CAUSES_A_WALK +- +The DTLB load and store miss rates give a measure of how often a TLB miss occurred +per instruction. The duration measures how many cycles a page table walk took. + diff --git a/collectors/likwid/groups/CLX/TLB_INSTR.txt b/collectors/likwid/groups/CLX/TLB_INSTR.txt new file mode 100644 index 0000000..9bc65a7 --- /dev/null +++ b/collectors/likwid/groups/CLX/TLB_INSTR.txt @@ -0,0 +1,28 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ITLB_MISSES_CAUSES_A_WALK +PMC1 ITLB_MISSES_WALK_ACTIVE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + + +LONG +Formulas: +L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK +L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_ACTIVE / ITLB_MISSES_CAUSES_A_WALK +- +The ITLB miss rate gives a measure of how often a TLB miss occurred +per instruction. The duration measures how many cycles a page table walk took.
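The TLB miss duration metrics are simply walk-active cycles divided by the number of started walks; a consumer of these counts should guard against a zero walk count. A small Go sketch with assumed values:

    package main

    import "fmt"

    // avgWalkDuration returns the average page-walk length in cycles,
    // guarded against a zero denominator. Inputs are hypothetical counts.
    func avgWalkDuration(walks, activeCycles float64) float64 {
        if walks == 0 {
            return 0
        }
        return activeCycles / walks
    }

    func main() {
        walks := 2.0e6   // e.g. DTLB_LOAD_MISSES_CAUSES_A_WALK, assumed
        active := 6.4e7  // e.g. DTLB_LOAD_MISSES_WALK_ACTIVE, assumed
        fmt.Printf("miss duration: %.1f cycles\n", avgWalkDuration(walks, active))
    }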
+ diff --git a/collectors/likwid/groups/CLX/TMA.txt b/collectors/likwid/groups/CLX/TMA.txt new file mode 100644 index 0000000..afb4126 --- /dev/null +++ b/collectors/likwid/groups/CLX/TMA.txt @@ -0,0 +1,48 @@ +SHORT Top down cycle allocation + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_RETIRED_RETIRE_SLOTS +PMC2 IDQ_UOPS_NOT_DELIVERED_CORE +PMC3 INT_MISC_RECOVERY_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +IPC FIXC0/FIXC1 +Total Slots 4*FIXC1 +Slots Retired PMC1 +Fetch Bubbles PMC2 +Recovery Bubbles 4*PMC3 +Front End [%] PMC2/(4*FIXC1)*100 +Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100 +Retiring [%] PMC1/(4*FIXC1)*100 +Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100 + +LONG +Formulas: +Total Slots = 4*CPU_CLK_UNHALTED_CORE +Slots Retired = UOPS_RETIRED_RETIRE_SLOTS +Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE +Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES +Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100 +Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100 +Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100 +Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100 +-- +This performance group measures cycles to determine the percentage of time spent in +front end, back end, retiring and speculation. These metrics are published and +verified by Intel. Further information: +Webpage describing Top-Down Method and its usage in Intel vTune: +https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method +Paper by Ahmad Yasin: +https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0 +Slides by Ahmad Yasin: +http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf +The performance group was originally published here: +http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/ diff --git a/collectors/likwid/groups/CLX/UOPS_EXEC.txt b/collectors/likwid/groups/CLX/UOPS_EXEC.txt new file mode 100644 index 0000000..7042df7 --- /dev/null +++ b/collectors/likwid/groups/CLX/UOPS_EXEC.txt @@ -0,0 +1,31 @@ +SHORT UOPs execution + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_USED_CYCLES +PMC1 UOPS_EXECUTED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding
the execution stage in the pipeline. Used cycles are all cycles where uOPs are +executed while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles.
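The TMA percentages derive from a slot model with four issue slots per core cycle. Here is a Go sketch of that breakdown with invented counter values; the Back End share is computed as the remainder, which is algebraically identical to the group's Back End formula:

    package main

    import "fmt"

    // Sketch of the four-slot top-down breakdown. All inputs are
    // hypothetical raw counts; slots = 4 * core cycles.
    func main() {
        cycles := 1.0e9         // CPU_CLK_UNHALTED_CORE
        uopsIssued := 2.8e9     // UOPS_ISSUED_ANY
        uopsRetired := 2.6e9    // UOPS_RETIRED_RETIRE_SLOTS
        fetchBubbles := 4.0e8   // IDQ_UOPS_NOT_DELIVERED_CORE
        recoveryCycles := 2.0e7 // INT_MISC_RECOVERY_CYCLES

        slots := 4.0 * cycles
        frontEnd := fetchBubbles / slots * 100
        speculation := (uopsIssued - uopsRetired + 4*recoveryCycles) / slots * 100
        retiring := uopsRetired / slots * 100
        backEnd := 100 - frontEnd - speculation - retiring // remainder of the slot budget

        fmt.Printf("Front End %.1f%%  Speculation %.1f%%  Retiring %.1f%%  Back End %.1f%%\n",
            frontEnd, speculation, retiring, backEnd)
    }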
diff --git a/collectors/likwid/groups/CLX/UOPS_ISSUE.txt b/collectors/likwid/groups/CLX/UOPS_ISSUE.txt new file mode 100644 index 0000000..9aac923 --- /dev/null +++ b/collectors/likwid/groups/CLX/UOPS_ISSUE.txt @@ -0,0 +1,31 @@ +SHORT UOPs issuing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_USED_CYCLES +PMC1 UOPS_ISSUED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the issue stage in the pipeline. Used cycles are all cycles where uOPs are +issued while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles. diff --git a/collectors/likwid/groups/CLX/UOPS_RETIRE.txt b/collectors/likwid/groups/CLX/UOPS_RETIRE.txt new file mode 100644 index 0000000..0f37585 --- /dev/null +++ b/collectors/likwid/groups/CLX/UOPS_RETIRE.txt @@ -0,0 +1,31 @@ +SHORT UOPs retirement + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_RETIRED_USED_CYCLES +PMC1 UOPS_RETIRED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the retirement stage in the pipeline (re-order buffer). Used cycles are all +cycles where uOPs are retired while unused cycles refer to pipeline stalls. +Moreover, the group calculates the average stall duration in cycles.
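The :EDGEDETECT modifier used by these three UOPS groups turns a stall-cycles event into a count of stall episodes (0-to-1 transitions), so dividing the plain count by the edge-detected count yields the average episode length. A minimal Go sketch with assumed values:

    package main

    import "fmt"

    // Sketch: average stall episode length from a stall-cycle count and its
    // edge-detected episode count. Both inputs are hypothetical.
    func main() {
        stallCycles := 3.0e8   // e.g. UOPS_ISSUED_STALL_CYCLES
        stallEpisodes := 1.5e7 // e.g. UOPS_ISSUED_STALL_CYCLES:EDGEDETECT
        if stallEpisodes > 0 {
            fmt.Printf("avg stall duration: %.1f cycles\n", stallCycles/stallEpisodes)
        }
    }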
diff --git a/collectors/likwid/groups/CLX/UPI.txt b/collectors/likwid/groups/CLX/UPI.txt new file mode 100644 index 0000000..2a4c44f --- /dev/null +++ b/collectors/likwid/groups/CLX/UPI.txt @@ -0,0 +1,42 @@ +SHORT UPI data traffic + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +SBOX0C0 TXL_FLITS_ALL_DATA +SBOX0C1 RXL_FLITS_ALL_DATA +SBOX1C0 TXL_FLITS_ALL_DATA +SBOX1C1 RXL_FLITS_ALL_DATA +SBOX2C0 TXL_FLITS_ALL_DATA +SBOX2C1 RXL_FLITS_ALL_DATA + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Received data bandwidth [MByte/s] 1.0E-06*((SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0/time +Received data volume [GByte] 1.0E-09*((SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0 +Sent data bandwidth [MByte/s] 1.0E-06*((SBOX0C0+SBOX1C0+SBOX2C0)/9.0)*64.0/time +Sent data volume [GByte] 1.0E-09*((SBOX0C0+SBOX1C0+SBOX2C0)/9.0)*64.0 +Total data bandwidth [MByte/s] 1.0E-06*((SBOX0C0+SBOX1C0+SBOX2C0+SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0/time +Total data volume [GByte] 1.0E-09*((SBOX0C0+SBOX1C0+SBOX2C0+SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0 + + +LONG +Formulas: +Received data bandwidth [MByte/s] = 1.0E-06*(SUM(RXL_FLITS_ALL_DATA)/9.0)*64.0/runtime +Received data volume [GByte] = 1.0E-09*(SUM(RXL_FLITS_ALL_DATA)/9.0)*64.0 +Sent data bandwidth [MByte/s] = 1.0E-06*(SUM(TXL_FLITS_ALL_DATA)/9.0)*64.0/runtime +Sent data volume [GByte] = 1.0E-09*(SUM(TXL_FLITS_ALL_DATA)/9.0)*64.0 +Total data bandwidth [MByte/s] = 1.0E-06*((SUM(RXL_FLITS_ALL_DATA)+SUM(TXL_FLITS_ALL_DATA))/9.0)*64.0/runtime +Total data volume [GByte] = 1.0E-09*((SUM(RXL_FLITS_ALL_DATA)+SUM(TXL_FLITS_ALL_DATA))/9.0)*64.0 +-- +This group measures the data traffic on the UPI (socket interconnect). The group +measures all filled data slots (9 slots per 64-byte data transfer), which is why +the count needs to be divided by 9. These 9 data chunks are not transferred in +a single flit; there is one flit for the header and three flits for the data. +The metrics may show higher values than expected because the events also count +other transfers which include data. diff --git a/collectors/likwid/groups/ICL/BRANCH.txt b/collectors/likwid/groups/ICL/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/ICL/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio directly states +what fraction of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate.
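The UPI formulas convert flits to bytes via the 9-flits-per-64-byte rule described above. A Go sketch with invented flit counts; txFlits/rxFlits stand for the summed SBOX counters:

    package main

    import "fmt"

    // Sketch: UPI data volume. Each 64-byte transfer occupies 9 counted
    // data slots, hence the division by 9. Values are hypothetical.
    func main() {
        txFlits := 9.0e8 // SUM(TXL_FLITS_ALL_DATA) over SBOX0..SBOX2
        rxFlits := 8.1e8 // SUM(RXL_FLITS_ALL_DATA) over SBOX0..SBOX2
        runtime := 5.0   // seconds

        sentBW := 1.0e-06 * (txFlits / 9.0) * 64.0 / runtime
        recvBW := 1.0e-06 * (rxFlits / 9.0) * 64.0 / runtime
        fmt.Printf("sent %.1f MByte/s, received %.1f MByte/s\n", sentBW, recvBW)
    }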
+ diff --git a/collectors/likwid/groups/ICL/DATA.txt b/collectors/likwid/groups/ICL/DATA.txt new file mode 100644 index 0000000..4e6e938 --- /dev/null +++ b/collectors/likwid/groups/ICL/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_INST_RETIRED_ALL_LOADS +PMC1 MEM_INST_RETIRED_ALL_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES +- +This is a metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/ICL/DIVIDE.txt b/collectors/likwid/groups/ICL/DIVIDE.txt new file mode 100644 index 0000000..40b4ab6 --- /dev/null +++ b/collectors/likwid/groups/ICL/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ARITH_DIVIDER_COUNT +PMC1 ARITH_DIVIDER_ACTIVE + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_DIVIDER_COUNT +Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/ICL/ENERGY.txt b/collectors/likwid/groups/ICL/ENERGY.txt new file mode 100644 index 0000000..fe7829f --- /dev/null +++ b/collectors/likwid/groups/ICL/ENERGY.txt @@ -0,0 +1,35 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR3 PWR_DRAM_ENERGY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +Ice Lake implements the RAPL interface. This interface makes it possible to +monitor the consumed energy on the package (socket) and DRAM level.
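The RAPL-based metrics reduce to energy divided by runtime. A tiny Go sketch with assumed readings (the joule values are made up, not from any real measurement):

    package main

    import "fmt"

    // Sketch: RAPL counters report joules accumulated over the measurement
    // interval, so average power is energy/runtime. Values are hypothetical.
    func main() {
        pkgEnergy := 450.0 // PWR_PKG_ENERGY [J]
        dramEnergy := 60.0 // PWR_DRAM_ENERGY [J]
        runtime := 10.0    // seconds

        fmt.Printf("Power [W]: %.1f\n", pkgEnergy/runtime)
        fmt.Printf("Power DRAM [W]: %.1f\n", dramEnergy/runtime)
    }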
+ diff --git a/collectors/likwid/groups/ICL/FLOPS_AVX.txt b/collectors/likwid/groups/ICL/FLOPS_AVX.txt new file mode 100644 index 0000000..e44a913 --- /dev/null +++ b/collectors/likwid/groups/ICL/FLOPS_AVX.txt @@ -0,0 +1,25 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0+PMC2*16.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0+PMC3*8.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +- +Packed 32b AVX FLOPs rates. diff --git a/collectors/likwid/groups/ICL/FLOPS_DP.txt b/collectors/likwid/groups/ICL/FLOPS_DP.txt new file mode 100644 index 0000000..177cff2 --- /dev/null +++ b/collectors/likwid/groups/ICL/FLOPS_DP.txt @@ -0,0 +1,34 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time +AVX512 DP [MFLOP/s] 1.0E-06*(PMC3*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +AVX512 DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE) +- +SSE scalar and packed double precision FLOP rates. 
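The FLOPS groups weight each retired FP instruction by its element count before normalizing to MFLOP/s: 2 for 128-bit packed DP, 1 for scalar, 4 for 256-bit, 8 for 512-bit. A Go sketch of the double-precision case; the counter values are invented:

    package main

    import "fmt"

    // Sketch: DP MFLOP/s and vectorization ratio from the four FP_ARITH
    // counters. All inputs are hypothetical.
    func main() {
        pd128 := 1.0e8  // FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
        scalar := 5.0e8 // FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
        pd256 := 2.0e9  // FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
        pd512 := 0.0    // FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE
        runtime := 4.0  // seconds

        flops := pd128*2 + scalar + pd256*4 + pd512*8
        fmt.Printf("DP [MFLOP/s]: %.1f\n", 1.0e-06*flops/runtime)
        fmt.Printf("Vectorization ratio [%%]: %.1f\n",
            100*(pd128+pd256+pd512)/(pd128+scalar+pd256+pd512))
    }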
+ diff --git a/collectors/likwid/groups/ICL/FLOPS_SP.txt b/collectors/likwid/groups/ICL/FLOPS_SP.txt new file mode 100644 index 0000000..01d98c2 --- /dev/null +++ b/collectors/likwid/groups/ICL/FLOPS_SP.txt @@ -0,0 +1,34 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time +AVX512 SP [MFLOP/s] 1.0E-06*(PMC3*16.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +AVX512 SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE) +- +SSE scalar and packed single precision FLOP rates. + diff --git a/collectors/likwid/groups/ICX/BRANCH.txt b/collectors/likwid/groups/ICX/BRANCH.txt new file mode 100644 index 0000000..3eea828 --- /dev/null +++ b/collectors/likwid/groups/ICX/BRANCH.txt @@ -0,0 +1,32 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio directly states +what fraction of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate.
+ diff --git a/collectors/likwid/groups/ICX/DATA.txt b/collectors/likwid/groups/ICX/DATA.txt new file mode 100644 index 0000000..ee15427 --- /dev/null +++ b/collectors/likwid/groups/ICX/DATA.txt @@ -0,0 +1,23 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS +PMC0 MEM_INST_RETIRED_ALL_LOADS +PMC1 MEM_INST_RETIRED_ALL_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES +- +This is a metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/ICX/DIVIDE.txt b/collectors/likwid/groups/ICX/DIVIDE.txt new file mode 100644 index 0000000..5e3be16 --- /dev/null +++ b/collectors/likwid/groups/ICX/DIVIDE.txt @@ -0,0 +1,25 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS +PMC0 ARITH_DIVIDER_COUNT +PMC1 ARITH_DIVIDER_ACTIVE + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_DIVIDER_COUNT +Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT +- +This performance group measures the average latency of divide operations diff --git a/collectors/likwid/groups/ICX/FLOPS_AVX.txt b/collectors/likwid/groups/ICX/FLOPS_AVX.txt new file mode 100644 index 0000000..0f41891 --- /dev/null +++ b/collectors/likwid/groups/ICX/FLOPS_AVX.txt @@ -0,0 +1,26 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS +PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0+PMC2*16.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0+PMC3*8.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +- +Packed 32b AVX FLOPs rates. 
diff --git a/collectors/likwid/groups/ICX/FLOPS_DP.txt b/collectors/likwid/groups/ICX/FLOPS_DP.txt new file mode 100644 index 0000000..64e7d3d --- /dev/null +++ b/collectors/likwid/groups/ICX/FLOPS_DP.txt @@ -0,0 +1,35 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time +AVX512 DP [MFLOP/s] 1.0E-06*(PMC3*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +AVX512 DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE) +- +SSE scalar and packed double precision FLOP rates. 
+ diff --git a/collectors/likwid/groups/ICX/FLOPS_SP.txt b/collectors/likwid/groups/ICX/FLOPS_SP.txt new file mode 100644 index 0000000..3e6780b --- /dev/null +++ b/collectors/likwid/groups/ICX/FLOPS_SP.txt @@ -0,0 +1,35 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time +AVX512 SP [MFLOP/s] 1.0E-06*(PMC3*16.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +AVX512 SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE) +- +SSE scalar and packed single precision FLOP rates. + diff --git a/collectors/likwid/groups/ICX/L2.txt b/collectors/likwid/groups/ICX/L2.txt new file mode 100644 index 0000000..efb6a1f --- /dev/null +++ b/collectors/likwid/groups/ICX/L2.txt @@ -0,0 +1,39 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS +PMC0 L1D_REPLACEMENT +PMC1 L2_TRANS_L1D_WB +PMC2 ICACHE_64B_IFTAG_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_64B_IFTAG_MISS)*64/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_64B_IFTAG_MISS)*64 +- +Profiling group to measure L2 cache bandwidth. 
The bandwidth is computed by the +number of cache lines allocated in the L1 and the number of modified cache lines +evicted from the L1. The group also outputs total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and traffic caused by misses in the +L1 instruction cache. + diff --git a/collectors/likwid/groups/TGL/BRANCH.txt b/collectors/likwid/groups/TGL/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/TGL/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio directly states +what fraction of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/TGL/DATA.txt b/collectors/likwid/groups/TGL/DATA.txt new file mode 100644 index 0000000..4e6e938 --- /dev/null +++ b/collectors/likwid/groups/TGL/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_INST_RETIRED_ALL_LOADS +PMC1 MEM_INST_RETIRED_ALL_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES +- +This is a metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/TGL/DIVIDE.txt b/collectors/likwid/groups/TGL/DIVIDE.txt new file mode 100644 index 0000000..40b4ab6 --- /dev/null +++ b/collectors/likwid/groups/TGL/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ARITH_DIVIDER_COUNT +PMC1 ARITH_DIVIDER_ACTIVE + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_DIVIDER_COUNT +Avg.
divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/TGL/ENERGY.txt b/collectors/likwid/groups/TGL/ENERGY.txt new file mode 100644 index 0000000..fe7829f --- /dev/null +++ b/collectors/likwid/groups/TGL/ENERGY.txt @@ -0,0 +1,35 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR3 PWR_DRAM_ENERGY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +Tiger Lake implements the RAPL interface. This interface makes it possible to +monitor the consumed energy on the package (socket) and DRAM level. + diff --git a/collectors/likwid/groups/TGL/FLOPS_AVX.txt b/collectors/likwid/groups/TGL/FLOPS_AVX.txt new file mode 100644 index 0000000..e44a913 --- /dev/null +++ b/collectors/likwid/groups/TGL/FLOPS_AVX.txt @@ -0,0 +1,25 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0+PMC2*16.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0+PMC3*8.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +- +Packed 32b AVX FLOPs rates.
diff --git a/collectors/likwid/groups/TGL/FLOPS_DP.txt b/collectors/likwid/groups/TGL/FLOPS_DP.txt new file mode 100644 index 0000000..177cff2 --- /dev/null +++ b/collectors/likwid/groups/TGL/FLOPS_DP.txt @@ -0,0 +1,34 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time +AVX512 DP [MFLOP/s] 1.0E-06*(PMC3*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +AVX512 DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE) +- +SSE scalar and packed double precision FLOP rates. 
+ diff --git a/collectors/likwid/groups/TGL/FLOPS_SP.txt b/collectors/likwid/groups/TGL/FLOPS_SP.txt new file mode 100644 index 0000000..01d98c2 --- /dev/null +++ b/collectors/likwid/groups/TGL/FLOPS_SP.txt @@ -0,0 +1,34 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time +AVX512 SP [MFLOP/s] 1.0E-06*(PMC3*16.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +AVX512 SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE) +- +SSE scalar and packed single precision FLOP rates. + diff --git a/collectors/likwid/groups/arm64fx/BRANCH.txt b/collectors/likwid/groups/arm64fx/BRANCH.txt new file mode 100644 index 0000000..dda12fb --- /dev/null +++ b/collectors/likwid/groups/arm64fx/BRANCH.txt @@ -0,0 +1,30 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 BR_PRED +PMC3 BR_MIS_PRED + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +Branch rate PMC2/PMC0 +Branch misprediction rate PMC3/PMC0 +Branch misprediction ratio PMC3/(PMC2+PMC3) +Instructions per branch PMC0/(PMC2+PMC3) + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +Branch rate = BR_PRED/INST_RETIRED +Branch misprediction rate = BR_MIS_PRED/INST_RETIRED +Branch misprediction ratio = BR_MIS_PRED/(BR_PRED+BR_MIS_PRED) +Instructions per branch = INST_RETIRED/(BR_PRED+BR_MIS_PRED) +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio directly states +what fraction of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate.
+ diff --git a/collectors/likwid/groups/arm64fx/DATA.txt b/collectors/likwid/groups/arm64fx/DATA.txt new file mode 100644 index 0000000..40f9cb3 --- /dev/null +++ b/collectors/likwid/groups/arm64fx/DATA.txt @@ -0,0 +1,24 @@ +SHORT Load to store ratio + +EVENTSET +PMC0 INST_SPEC +PMC1 CPU_CYCLES +PMC2 LD_SPEC +PMC3 ST_SPEC + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +Load to store ratio PMC2/PMC3 +Load ratio PMC2/PMC0 +Store ratio PMC3/PMC0 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_SPEC +Load to store ratio = LD_SPEC / ST_SPEC +Load ratio = LD_SPEC / INST_SPEC +Store ratio = ST_SPEC / INST_SPEC +- +This is a metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/arm64fx/FLOPS_DP.txt b/collectors/likwid/groups/arm64fx/FLOPS_DP.txt new file mode 100644 index 0000000..5e8a565 --- /dev/null +++ b/collectors/likwid/groups/arm64fx/FLOPS_DP.txt @@ -0,0 +1,26 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC3 FP_DP_FIXED_OPS_SPEC +PMC4 FP_DP_SCALE_OPS_SPEC + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +DP (FP) [MFLOP/s] 1E-06*(PMC3)/time +DP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC4*128)/128)+PMC3)/time +DP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC4*256)/128)+PMC3)/time +DP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC4*512)/128)+PMC3)/time + +LONG +Formulas: +DP (FP) [MFLOP/s] = 1E-06*FP_DP_FIXED_OPS_SPEC/time +DP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*128)/128))/time +DP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*256)/128))/time +DP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*512)/128))/time +- +Double-precision FP rate for scalar and SVE vector operations with different widths. The events for +the SVE metrics assume that all vector elements are active. diff --git a/collectors/likwid/groups/arm64fx/FLOPS_HP.txt b/collectors/likwid/groups/arm64fx/FLOPS_HP.txt new file mode 100644 index 0000000..4f449a2 --- /dev/null +++ b/collectors/likwid/groups/arm64fx/FLOPS_HP.txt @@ -0,0 +1,26 @@ +SHORT Half-Precision MFLOP/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC3 FP_HP_FIXED_OPS_SPEC +PMC4 FP_HP_SCALE_OPS_SPEC + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +HP (FP) [MFLOP/s] 1E-06*(PMC3)/time +HP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC4*128)/128)+PMC3)/time +HP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC4*256)/128)+PMC3)/time +HP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC4*512)/128)+PMC3)/time + +LONG +Formulas: +HP (FP) [MFLOP/s] = 1E-06*FP_HP_FIXED_OPS_SPEC/time +HP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*128)/128))/time +HP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*256)/128))/time +HP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*512)/128))/time +- +Half-precision FP rate for scalar and SVE vector operations with different widths. The events for +the SVE metrics assume that all vector elements are active.
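On the A64FX, FP_*_SCALE_OPS_SPEC counts in 128-bit units, so the FLOP total is scaled by the assumed SVE vector length: ops*VL/128. A Go sketch of that scaling with invented counts:

    package main

    import "fmt"

    // Sketch: A64FX FLOP rate with SVE scaling. scaleOps counts 128-bit
    // equivalents; the metric scales them by the assumed vector length VL.
    // All values are hypothetical.
    func main() {
        fixedOps := 2.0e8 // FP_DP_FIXED_OPS_SPEC (scalar/fixed-width part)
        scaleOps := 1.6e9 // FP_DP_SCALE_OPS_SPEC (SVE part, 128-bit units)
        runtime := 2.0    // seconds

        for _, vl := range []float64{128, 256, 512} {
            mflops := 1.0e-06 * (fixedOps + scaleOps*vl/128.0) / runtime
            fmt.Printf("DP (FP+SVE%.0f) [MFLOP/s]: %.1f\n", vl, mflops)
        }
    }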
diff --git a/collectors/likwid/groups/arm64fx/FLOPS_SP.txt b/collectors/likwid/groups/arm64fx/FLOPS_SP.txt new file mode 100644 index 0000000..d3248eb --- /dev/null +++ b/collectors/likwid/groups/arm64fx/FLOPS_SP.txt @@ -0,0 +1,26 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC3 FP_SP_FIXED_OPS_SPEC +PMC4 FP_SP_SCALE_OPS_SPEC + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +SP (FP) [MFLOP/s] 1E-06*(PMC3)/time +SP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC4*128)/128)+PMC3)/time +SP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC4*256)/128)+PMC3)/time +SP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC4*512)/128)+PMC3)/time + +LONG +Formulas: +SP (FP) [MFLOP/s] = 1E-06*FP_SP_FIXED_OPS_SPEC/time +SP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*128)/128))/time +SP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*256)/128))/time +SP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*512)/128))/time +- +Single-precision FP rate for scalar and SVE vector operations with different widths. The events for +the SVE metrics assume that all vector elements are active. diff --git a/collectors/likwid/groups/arm64fx/FP_PIPE.txt b/collectors/likwid/groups/arm64fx/FP_PIPE.txt new file mode 100644 index 0000000..2cde7ef --- /dev/null +++ b/collectors/likwid/groups/arm64fx/FP_PIPE.txt @@ -0,0 +1,33 @@ +SHORT Utilization of FP pipelines + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 FLA_VAL +PMC3 FLA_VAL_PRD_CNT +PMC4 FLB_VAL +PMC5 FLB_VAL_PRD_CNT + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +FP operation pipeline A busy rate [%] (PMC2/PMC1)*100.0 +FP pipeline A active element rate [%] (PMC3/(PMC2*16))*100.0 +FP operation pipeline B busy rate [%] (PMC4/PMC1)*100.0 +FP pipeline B active element rate [%] (PMC5/(PMC4*16))*100.0 + + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +FP operation pipeline A busy rate [%] = (FLA_VAL/CPU_CYCLES)*100.0 +FP pipeline A active element rate [%] = (FLA_VAL_PRD_CNT/(FLA_VAL*16))*100.0 +FP operation pipeline B busy rate [%] = (FLB_VAL/CPU_CYCLES)*100.0 +FP pipeline B active element rate [%] = (FLB_VAL_PRD_CNT/(FLB_VAL*16))*100.0 +- +FLx_VAL: This event counts the valid cycles of the FLx pipeline. +FLx_VAL_PRD_CNT: This event counts the number of 1's in the predicate bits of + requests in the FLx pipeline, corrected so that it + becomes 16 when all bits are 1. +Each predicate mask has 16 slots, so there are 16 slots per cycle in FLA and +FLB. FLA is partly used by other instructions like SVE stores. diff --git a/collectors/likwid/groups/arm64fx/ICACHE.txt b/collectors/likwid/groups/arm64fx/ICACHE.txt new file mode 100644 index 0000000..6a0bbea --- /dev/null +++ b/collectors/likwid/groups/arm64fx/ICACHE.txt @@ -0,0 +1,24 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L1I_CACHE +PMC3 L1I_CACHE_REFILL + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +L1I request rate PMC2/PMC0 +L1I miss rate PMC3/PMC0 +L1I miss ratio PMC3/PMC2 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +L1I request rate = L1I_CACHE / INST_RETIRED +L1I miss rate = L1I_CACHE_REFILL / INST_RETIRED +L1I miss ratio = L1I_CACHE_REFILL / L1I_CACHE +- +This group measures some L1 instruction cache metrics.
diff --git a/collectors/likwid/groups/arm64fx/L2.txt b/collectors/likwid/groups/arm64fx/L2.txt new file mode 100644 index 0000000..be47585 --- /dev/null +++ b/collectors/likwid/groups/arm64fx/L2.txt @@ -0,0 +1,40 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L1D_CACHE_REFILL +PMC3 L1D_CACHE_WB +PMC4 L1I_CACHE_REFILL + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +L1D<-L2 load bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time +L1D<-L2 load data volume [GBytes] 1.0E-09*(PMC2)*256.0 +L1D->L2 evict bandwidth [MBytes/s] 1.0E-06*PMC3*256.0/time +L1D->L2 evict data volume [GBytes] 1.0E-09*PMC3*256.0 +L1I<-L2 load bandwidth [MBytes/s] 1.0E-06*PMC4*256.0/time +L1I<-L2 load data volume [GBytes] 1.0E-09*PMC4*256.0 +L1<->L2 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3+PMC4)*256.0/time +L1<->L2 data volume [GBytes] 1.0E-09*(PMC2+PMC3+PMC4)*256.0 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +L1D<-L2 load bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_REFILL*256.0/time +L1D<-L2 load data volume [GBytes] = 1.0E-09*L1D_CACHE_REFILL*256.0 +L1D->L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_WB*256.0/time +L1D->L2 evict data volume [GBytes] = 1.0E-09*L1D_CACHE_WB*256.0 +L1I<-L2 load bandwidth [MBytes/s] = 1.0E-06*L1I_CACHE_REFILL*256.0/time +L1I<-L2 load data volume [GBytes] = 1.0E-09*L1I_CACHE_REFILL*256.0 +L1<->L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*256.0/time +L1<->L2 data volume [GBytes] = 1.0E-09*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*256.0 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the +number of cache lines loaded from the L2 to the L1 data cache and the writebacks from +the L1 data cache to the L2 cache. The group also outputs the total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and cache lines transferred into the L1 instruction +cache. diff --git a/collectors/likwid/groups/arm64fx/MEM.txt b/collectors/likwid/groups/arm64fx/MEM.txt new file mode 100644 index 0000000..b192b8b --- /dev/null +++ b/collectors/likwid/groups/arm64fx/MEM.txt @@ -0,0 +1,29 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 BUS_READ_TOTAL_MEM +PMC3 BUS_WRITE_TOTAL_MEM + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time +Memory read data volume [GBytes] 1.0E-09*(PMC2)*256.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time +Memory write data volume [GBytes] 1.0E-09*(PMC3)*256.0 +Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time +Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM)*256.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM)*256.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_MEM)*256.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_MEM)*256.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0/runtime +Memory data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0 +- +Profiling group to measure memory bandwidth. The cache line size is 256 bytes.
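The only difference from the x86 memory groups above is the 256-byte cache line, so the byte multiplier changes from 64 to 256. A short Go sketch with invented counts:

    package main

    import "fmt"

    // Sketch: A64FX memory bandwidth; the bus events count 256-byte lines.
    // All inputs are hypothetical.
    func main() {
        busRead := 3.0e8  // BUS_READ_TOTAL_MEM
        busWrite := 1.0e8 // BUS_WRITE_TOTAL_MEM
        runtime := 5.0    // seconds

        readBW := 1.0e-06 * busRead * 256.0 / runtime
        writeBW := 1.0e-06 * busWrite * 256.0 / runtime
        fmt.Printf("read %.1f MBytes/s, write %.1f MBytes/s, total %.1f MBytes/s\n",
            readBW, writeBW, readBW+writeBW)
    }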
diff --git a/collectors/likwid/groups/arm64fx/MEM_DP.txt b/collectors/likwid/groups/arm64fx/MEM_DP.txt new file mode 100644 index 0000000..96506ff --- /dev/null +++ b/collectors/likwid/groups/arm64fx/MEM_DP.txt @@ -0,0 +1,50 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 BUS_READ_TOTAL_MEM +PMC3 BUS_WRITE_TOTAL_MEM +PMC4 FP_DP_FIXED_OPS_SPEC +PMC5 FP_DP_SCALE_OPS_SPEC + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +DP (FP) [MFLOP/s] 1E-06*(PMC4)/time +DP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC5*128.0)/128.0)+PMC4)/time +DP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC5*256.0)/128.0)+PMC4)/time +DP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC5*512.0)/128.0)+PMC4)/time +Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time +Memory read data volume [GBytes] 1.0E-09*(PMC2)*256.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time +Memory write data volume [GBytes] 1.0E-09*(PMC3)*256.0 +Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time +Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0 +Operational intensity (FP) PMC4/((PMC2+PMC3)*256.0) +Operational intensity (FP+SVE128) (((PMC5*128.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) +Operational intensity (FP+SVE256) (((PMC5*256.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) +Operational intensity (FP+SVE512) (((PMC5*512.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) + + +LONG +Formulas: +DP (FP) [MFLOP/s] = 1E-06*FP_DP_FIXED_OPS_SPEC/time +DP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*128)/128))/time +DP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*256)/128))/time +DP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*512)/128))/time +Memory read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM)*256.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM)*256.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_MEM)*256.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_MEM)*256.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0/runtime +Memory data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0 +Operational intensity (FP) = FP_DP_FIXED_OPS_SPEC/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) +Operational intensity (FP+SVE128) = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*128)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) +Operational intensity (FP+SVE256) = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*256)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) +Operational intensity (FP+SVE512) = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*512)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) +- +Profiling group to measure memory bandwidth and double-precision FP rate for scalar and SVE vector +operations with different widths. The events for the SVE metrics assume that all vector elements +are active. The cache line size is 256 Byte.
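The operational intensity metrics above are the FLOP count divided by the main-memory traffic in bytes, i.e. the x-axis of a roofline plot. A small Go sketch of that computation (the function name and all counter values are hypothetical, assuming a 256-byte cache line as in the group):

package main

import "fmt"

// operationalIntensity mirrors the "Operational intensity (FP+SVE<width>)"
// metric of MEM_DP: DP FLOPs divided by main-memory traffic in bytes.
// reads/writes correspond to BUS_READ_TOTAL_MEM/BUS_WRITE_TOTAL_MEM.
func operationalIntensity(fixed, scale float64, width int, reads, writes float64) float64 {
	flops := fixed + scale*float64(width)/128.0
	bytes := (reads + writes) * 256.0
	return flops / bytes // FLOP per byte, the roofline x-axis
}

func main() {
	// Hypothetical counts: scalar FLOPs, SVE granules, read/write lines.
	oi := operationalIntensity(8e9, 2e9, 512, 6e7, 4e7)
	fmt.Printf("Operational intensity (FP+SVE512) = %.2f FLOP/Byte\n", oi)
}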
diff --git a/collectors/likwid/groups/arm64fx/MEM_HP.txt b/collectors/likwid/groups/arm64fx/MEM_HP.txt new file mode 100644 index 0000000..17d86e9 --- /dev/null +++ b/collectors/likwid/groups/arm64fx/MEM_HP.txt @@ -0,0 +1,50 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 BUS_READ_TOTAL_MEM +PMC3 BUS_WRITE_TOTAL_MEM +PMC4 FP_HP_FIXED_OPS_SPEC +PMC5 FP_HP_SCALE_OPS_SPEC + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +HP (FP) [MFLOP/s] 1E-06*(PMC4)/time +HP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC5*128.0)/128.0)+PMC4)/time +HP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC5*256.0)/128.0)+PMC4)/time +HP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC5*512.0)/128.0)+PMC4)/time +Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time +Memory read data volume [GBytes] 1.0E-09*(PMC2)*256.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time +Memory write data volume [GBytes] 1.0E-09*(PMC3)*256.0 +Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time +Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0 +Operational intensity (FP) PMC4/((PMC2+PMC3)*256.0) +Operational intensity (FP+SVE128) (((PMC5*128.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) +Operational intensity (FP+SVE256) (((PMC5*256.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) +Operational intensity (FP+SVE512) (((PMC5*512.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) + + +LONG +Formulas: +HP (FP) [MFLOP/s] = 1E-06*FP_HP_FIXED_OPS_SPEC/time +HP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*128)/128))/time +HP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*256)/128))/time +HP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*512)/128))/time +Memory read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM)*256.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM)*256.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_MEM)*256.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_MEM)*256.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0/runtime +Memory data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0 +Operational intensity (FP) = FP_HP_FIXED_OPS_SPEC/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) +Operational intensity (FP+SVE128) = (FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*128)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) +Operational intensity (FP+SVE256) = (FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*256)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) +Operational intensity (FP+SVE512) = (FP_HP_FIXED_OPS_SPEC+((FP_HP_SCALE_OPS_SPEC*512)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) +- +Profiling group to measure memory bandwidth and half-precision FP rate for scalar and SVE vector +operations with different widths. The events for the SVE metrics assume that all vector elements +are active. The cache line size is 256 Byte.
diff --git a/collectors/likwid/groups/arm64fx/MEM_SP.txt b/collectors/likwid/groups/arm64fx/MEM_SP.txt new file mode 100644 index 0000000..b6220b0 --- /dev/null +++ b/collectors/likwid/groups/arm64fx/MEM_SP.txt @@ -0,0 +1,50 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 BUS_READ_TOTAL_MEM +PMC3 BUS_WRITE_TOTAL_MEM +PMC4 FP_SP_FIXED_OPS_SPEC +PMC5 FP_SP_SCALE_OPS_SPEC + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +SP (FP) [MFLOP/s] 1E-06*(PMC4)/time +SP (FP+SVE128) [MFLOP/s] 1E-06*(((PMC5*128.0)/128.0)+PMC4)/time +SP (FP+SVE256) [MFLOP/s] 1E-06*(((PMC5*256.0)/128.0)+PMC4)/time +SP (FP+SVE512) [MFLOP/s] 1E-06*(((PMC5*512.0)/128.0)+PMC4)/time +Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time +Memory read data volume [GBytes] 1.0E-09*(PMC2)*256.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time +Memory write data volume [GBytes] 1.0E-09*(PMC3)*256.0 +Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time +Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0 +Operational intensity (FP) PMC4/((PMC2+PMC3)*256.0) +Operational intensity (FP+SVE128) (((PMC5*128.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) +Operational intensity (FP+SVE256) (((PMC5*256.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) +Operational intensity (FP+SVE512) (((PMC5*512.0)/128.0)+PMC4)/((PMC2+PMC3)*256.0) + + +LONG +Formulas: +SP (FP) [MFLOP/s] = 1E-06*FP_SP_FIXED_OPS_SPEC/time +SP (FP+SVE128) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*128)/128))/time +SP (FP+SVE256) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*256)/128))/time +SP (FP+SVE512) [MFLOP/s] = 1.0E-06*(FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*512)/128))/time +Memory read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM)*256.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM)*256.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_MEM)*256.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_MEM)*256.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0/runtime +Memory data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0 +Operational intensity (FP) = FP_SP_FIXED_OPS_SPEC/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) +Operational intensity (FP+SVE128) = (FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*128)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) +Operational intensity (FP+SVE256) = (FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*256)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) +Operational intensity (FP+SVE512) = (FP_SP_FIXED_OPS_SPEC+((FP_SP_SCALE_OPS_SPEC*512)/128))/((BUS_READ_TOTAL_MEM+BUS_WRITE_TOTAL_MEM)*256.0) +- +Profiling group to measure memory bandwidth and single-precision FP rate for scalar and SVE vector +operations with different widths. The events for the SVE metrics assume that all vector elements
are active. The cache line size is 256 Byte.
diff --git a/collectors/likwid/groups/arm64fx/PCI.txt b/collectors/likwid/groups/arm64fx/PCI.txt new file mode 100644 index 0000000..bca76a6 --- /dev/null +++ b/collectors/likwid/groups/arm64fx/PCI.txt @@ -0,0 +1,29 @@ +SHORT PCI bandwidth in MBytes/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 BUS_READ_TOTAL_PCI +PMC3 BUS_WRITE_TOTAL_PCI + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +PCI read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time +PCI read data volume [GBytes] 1.0E-09*(PMC2)*256.0 +PCI write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time +PCI write data volume [GBytes] 1.0E-09*(PMC3)*256.0 +PCI bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time +PCI data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0 + +LONG +Formulas: +PCI read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_PCI)*256.0/runtime +PCI read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_PCI)*256.0 +PCI write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_PCI)*256.0/runtime +PCI write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_PCI)*256.0 +PCI bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_PCI+BUS_WRITE_TOTAL_PCI)*256.0/runtime +PCI data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_PCI+BUS_WRITE_TOTAL_PCI)*256.0 +- +Profiling group to measure PCI bandwidth. The cache line size is 256 Byte. diff --git a/collectors/likwid/groups/arm64fx/TOFU.txt b/collectors/likwid/groups/arm64fx/TOFU.txt new file mode 100644 index 0000000..2bebe3e --- /dev/null +++ b/collectors/likwid/groups/arm64fx/TOFU.txt @@ -0,0 +1,29 @@ +SHORT TOFU bandwidth in MBytes/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 BUS_READ_TOTAL_TOFU +PMC3 BUS_WRITE_TOTAL_TOFU + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +TOFU read bandwidth [MBytes/s] 1.0E-06*(PMC2)*256.0/time +TOFU read data volume [GBytes] 1.0E-09*(PMC2)*256.0 +TOFU write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time +TOFU write data volume [GBytes] 1.0E-09*(PMC3)*256.0 +TOFU bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time +TOFU data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0 + +LONG +Formulas: +TOFU read bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_TOFU)*256.0/runtime +TOFU read data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_TOFU)*256.0 +TOFU write bandwidth [MBytes/s] = 1.0E-06*(BUS_WRITE_TOTAL_TOFU)*256.0/runtime +TOFU write data volume [GBytes] = 1.0E-09*(BUS_WRITE_TOTAL_TOFU)*256.0 +TOFU bandwidth [MBytes/s] = 1.0E-06*(BUS_READ_TOTAL_TOFU+BUS_WRITE_TOTAL_TOFU)*256.0/runtime +TOFU data volume [GBytes] = 1.0E-09*(BUS_READ_TOTAL_TOFU+BUS_WRITE_TOTAL_TOFU)*256.0 +- +Profiling group to measure TOFU bandwidth. The cache line size is 256 Byte. 
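All of the bandwidth groups in this set share one pattern: a counter of cache line transfers multiplied by the line size, scaled to MBytes/s (factor 1.0E-06) or GBytes (factor 1.0E-09). A minimal Go sketch of that recurring computation (the function names and the counts in main are hypothetical):

package main

import "fmt"

// bandwidthMBs and volumeGB implement the pattern shared by the bandwidth
// groups above: a count of cache line transfers (e.g. BUS_READ_TOTAL_PCI)
// times the line size in bytes, scaled to MBytes/s or GBytes.
func bandwidthMBs(lines, lineSize, seconds float64) float64 {
	return 1.0e-06 * lines * lineSize / seconds
}

func volumeGB(lines, lineSize float64) float64 {
	return 1.0e-09 * lines * lineSize
}

func main() {
	// Hypothetical: 5e7 read line transfers over 1.5 s, 256-byte lines.
	fmt.Printf("read bandwidth: %.1f MBytes/s\n", bandwidthMBs(5e7, 256.0, 1.5))
	fmt.Printf("read data volume: %.2f GBytes\n", volumeGB(5e7, 256.0))
}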
diff --git a/collectors/likwid/groups/arm8/BRANCH.txt b/collectors/likwid/groups/arm8/BRANCH.txt new file mode 100644 index 0000000..8cd4f00 --- /dev/null +++ b/collectors/likwid/groups/arm8/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 BR_PRED +PMC3 BR_MIS_PRED +PMC4 INST_SPEC + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +Branch rate PMC2/PMC0 +Branch misprediction rate PMC3/PMC0 +Branch misprediction ratio PMC3/(PMC2+PMC3) +Instructions per branch PMC0/(PMC2+PMC3) + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +Branch rate = BR_PRED/INST_RETIRED +Branch misprediction rate = BR_MIS_PRED/INST_RETIRED +Branch misprediction ratio = BR_MIS_PRED/(BR_PRED+BR_MIS_PRED) +Instructions per branch = INST_RETIRED/(BR_PRED+BR_MIS_PRED) +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio directly +expresses what fraction of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/arm8/DATA.txt b/collectors/likwid/groups/arm8/DATA.txt new file mode 100644 index 0000000..4338d90 --- /dev/null +++ b/collectors/likwid/groups/arm8/DATA.txt @@ -0,0 +1,24 @@ +SHORT Load to store ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 LD_RETIRED +PMC3 ST_RETIRED + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +Load to store ratio PMC2/PMC3 +Load ratio PMC2/PMC0 +Store ratio PMC3/PMC0 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +Load to store ratio = LD_RETIRED / ST_RETIRED +Load ratio = LD_RETIRED / INST_RETIRED +Store ratio = ST_RETIRED / INST_RETIRED +- +This is a metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/arm8/ICACHE.txt b/collectors/likwid/groups/arm8/ICACHE.txt new file mode 100644 index 0000000..6a0bbea --- /dev/null +++ b/collectors/likwid/groups/arm8/ICACHE.txt @@ -0,0 +1,24 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L1I_CACHE +PMC3 L1I_CACHE_REFILL + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +L1I request rate PMC2/PMC0 +L1I miss rate PMC3/PMC0 +L1I miss ratio PMC3/PMC2 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +L1I request rate = L1I_CACHE / INST_RETIRED +L1I miss rate = L1I_CACHE_REFILL / INST_RETIRED +L1I miss ratio = L1I_CACHE_REFILL / L1I_CACHE +- +This group measures some L1 instruction cache metrics.
diff --git a/collectors/likwid/groups/arm8/L2.txt b/collectors/likwid/groups/arm8/L2.txt new file mode 100644 index 0000000..9f0c2e4 --- /dev/null +++ b/collectors/likwid/groups/arm8/L2.txt @@ -0,0 +1,40 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L1D_CACHE_REFILL +PMC3 L1D_CACHE_WB +PMC4 L1I_CACHE_REFILL + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC2*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L2I load bandwidth [MBytes/s] 1.0E-06*PMC4*64.0/time +L2I load data volume [GBytes] 1.0E-09*PMC4*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3+PMC4)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC2+PMC3+PMC4)*64.0 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_REFILL*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_CACHE_REFILL*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_WB*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_CACHE_WB*64.0 +L2I load bandwidth [MBytes/s] = 1.0E-06*L1I_CACHE_REFILL*64.0/time +L2I load data volume [GBytes] = 1.0E-09*L1I_CACHE_REFILL*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0/time +L2 data volume [GBytes] = 1.0E-09*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the +number of cache lines loaded from the L2 into the L1 data cache and the writebacks from +the L1 data cache to the L2 cache. The group also outputs the total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and cache lines transferred into the instruction +cache. diff --git a/collectors/likwid/groups/arm8/MEM.txt b/collectors/likwid/groups/arm8/MEM.txt new file mode 100644 index 0000000..d383916 --- /dev/null +++ b/collectors/likwid/groups/arm8/MEM.txt @@ -0,0 +1,30 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L2D_CACHE_REFILL +PMC3 L2D_CACHE_WB + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(PMC2)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(PMC3)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time +Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL)*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL)*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_WB)*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(L2D_CACHE_WB)*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL+L2D_CACHE_WB)*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL+L2D_CACHE_WB)*64.0 +- +Profiling group to measure memory bandwidth as initiated by the L2 cache.
+ diff --git a/collectors/likwid/groups/arm8_n1/BRANCH.txt b/collectors/likwid/groups/arm8_n1/BRANCH.txt new file mode 100644 index 0000000..8cd4f00 --- /dev/null +++ b/collectors/likwid/groups/arm8_n1/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 BR_PRED +PMC3 BR_MIS_PRED +PMC4 INST_SPEC + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +Branch rate PMC2/PMC0 +Branch misprediction rate PMC3/PMC0 +Branch misprediction ratio PMC3/(PMC2+PMC3) +Instructions per branch PMC0/(PMC2+PMC3) + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +Branch rate = BR_PRED/INST_RETIRED +Branch misprediction rate = BR_MIS_PRED/INST_RETIRED +Branch misprediction ratio = BR_MIS_PRED/(BR_PRED+BR_MIS_PRED) +Instructions per branch = INST_RETIRED/(BR_PRED+BR_MIS_PRED) +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio directly +expresses what fraction of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/arm8_n1/CLOCK.txt b/collectors/likwid/groups/arm8_n1/CLOCK.txt new file mode 100644 index 0000000..ad7303a --- /dev/null +++ b/collectors/likwid/groups/arm8_n1/CLOCK.txt @@ -0,0 +1,16 @@ +SHORT Cycles and instructions + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +- +This is a metric to determine cycles per instruction. + diff --git a/collectors/likwid/groups/arm8_n1/DATA.txt b/collectors/likwid/groups/arm8_n1/DATA.txt new file mode 100644 index 0000000..d2221a8 --- /dev/null +++ b/collectors/likwid/groups/arm8_n1/DATA.txt @@ -0,0 +1,24 @@ +SHORT Load to store ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 LD_SPEC +PMC3 ST_SPEC + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +Load to store ratio PMC2/PMC3 +Load ratio PMC2/PMC0 +Store ratio PMC3/PMC0 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +Load to store ratio = LD_SPEC / ST_SPEC +Load ratio = LD_SPEC / INST_RETIRED +Store ratio = ST_SPEC / INST_RETIRED +- +This is a metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/arm8_n1/ICACHE.txt b/collectors/likwid/groups/arm8_n1/ICACHE.txt new file mode 100644 index 0000000..6a0bbea --- /dev/null +++ b/collectors/likwid/groups/arm8_n1/ICACHE.txt @@ -0,0 +1,24 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L1I_CACHE +PMC3 L1I_CACHE_REFILL + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +L1I request rate PMC2/PMC0 +L1I miss rate PMC3/PMC0 +L1I miss ratio PMC3/PMC2 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +L1I request rate = L1I_CACHE / INST_RETIRED +L1I miss rate = L1I_CACHE_REFILL / INST_RETIRED +L1I miss ratio = L1I_CACHE_REFILL / L1I_CACHE +- +This group measures some L1 instruction cache metrics.
diff --git a/collectors/likwid/groups/arm8_n1/L2.txt b/collectors/likwid/groups/arm8_n1/L2.txt new file mode 100644 index 0000000..9f0c2e4 --- /dev/null +++ b/collectors/likwid/groups/arm8_n1/L2.txt @@ -0,0 +1,40 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L1D_CACHE_REFILL +PMC3 L1D_CACHE_WB +PMC4 L1I_CACHE_REFILL + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC2*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L2I load bandwidth [MBytes/s] 1.0E-06*PMC4*64.0/time +L2I load data volume [GBytes] 1.0E-09*PMC4*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3+PMC4)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC2+PMC3+PMC4)*64.0 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_REFILL*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_CACHE_REFILL*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_WB*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_CACHE_WB*64.0 +L2I load bandwidth [MBytes/s] = 1.0E-06*L1I_CACHE_REFILL*64.0/time +L2I load data volume [GBytes] = 1.0E-09*L1I_CACHE_REFILL*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0/time +L2 data volume [GBytes] = 1.0E-09*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the +number of cache lines loaded from the L2 into the L1 data cache and the writebacks from +the L1 data cache to the L2 cache. The group also outputs the total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and cache lines transferred into the instruction +cache. diff --git a/collectors/likwid/groups/arm8_n1/L3.txt b/collectors/likwid/groups/arm8_n1/L3.txt new file mode 100644 index 0000000..3c8a73e --- /dev/null +++ b/collectors/likwid/groups/arm8_n1/L3.txt @@ -0,0 +1,30 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L2D_CACHE_REFILL +PMC3 L2D_CACHE_WB + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +L3 read bandwidth [MBytes/s] 1.0E-06*(PMC2)*64.0/time +L3 read data volume [GBytes] 1.0E-09*(PMC2)*64.0 +L3 write bandwidth [MBytes/s] 1.0E-06*(PMC3)*64.0/time +L3 write data volume [GBytes] 1.0E-09*(PMC3)*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 + +LONG +Formulas: +L3 read bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL)*64.0/runtime +L3 read data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL)*64.0 +L3 write bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_WB)*64.0/runtime +L3 write data volume [GBytes] = 1.0E-09*(L2D_CACHE_WB)*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL+L2D_CACHE_WB)*64.0/runtime +L3 data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL+L2D_CACHE_WB)*64.0 +- +Profiling group to measure traffic between L2 and L3 cache.
+ diff --git a/collectors/likwid/groups/arm8_n1/MEM.txt b/collectors/likwid/groups/arm8_n1/MEM.txt new file mode 100644 index 0000000..8c334bb --- /dev/null +++ b/collectors/likwid/groups/arm8_n1/MEM.txt @@ -0,0 +1,29 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 MEM_ACCESS_RD +PMC3 MEM_ACCESS_WR + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(PMC2)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(PMC2)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(PMC3)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time +Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(MEM_ACCESS_RD)*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(MEM_ACCESS_RD)*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(MEM_ACCESS_WR)*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(MEM_ACCESS_WR)*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(MEM_ACCESS_RD+MEM_ACCESS_WR)*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(MEM_ACCESS_RD+MEM_ACCESS_WR)*64.0 +- +Profiling group to measure memory bandwidth. + diff --git a/collectors/likwid/groups/arm8_n1/TLB.txt b/collectors/likwid/groups/arm8_n1/TLB.txt new file mode 100644 index 0000000..4e588b1 --- /dev/null +++ b/collectors/likwid/groups/arm8_n1/TLB.txt @@ -0,0 +1,30 @@ +SHORT L1/L2 TLB information + +EVENTSET +PMC0 L1D_TLB +PMC1 L1I_TLB +PMC2 L2D_TLB +PMC3 L1D_TLB_REFILL +PMC4 L1I_TLB_REFILL +PMC5 L2D_TLB_REFILL + +METRICS +Runtime (RDTSC) [s] time +L1 DTLB accesses PMC0 +L1 ITLB accesses PMC1 +L2 DTLB accesses PMC2 +L1 DTLB refills PMC3 +L1 ITLB refills PMC4 +L2 DTLB refills PMC5 +L1 DTLB refill ratio PMC3/PMC0 +L1 ITLB refill ratio PMC4/PMC1 +L2 DTLB refill ratio PMC5/PMC2 + +LONG +Formulas: +L1 DTLB refill ratio = L1D_TLB_REFILL / L1D_TLB +L1 ITLB refill ratio = L1I_TLB_REFILL / L1I_TLB +L2 DTLB refill ratio = L2D_TLB_REFILL / L2D_TLB +- +This group gives information about the TLB usage for all TLBs: +L1 data, L1 instruction and L2 data. diff --git a/collectors/likwid/groups/arm8_tx2/BRANCH.txt b/collectors/likwid/groups/arm8_tx2/BRANCH.txt new file mode 100644 index 0000000..db0fa40 --- /dev/null +++ b/collectors/likwid/groups/arm8_tx2/BRANCH.txt @@ -0,0 +1,32 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 BR_PRED +PMC3 BR_MIS_PRED +PMC4 INST_SPEC + + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +Branch rate PMC2/PMC0 +Branch misprediction rate PMC3/PMC0 +Branch misprediction ratio PMC3/(PMC2+PMC3) +Instructions per branch PMC0/(PMC2+PMC3) + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +Branch rate = BR_PRED/INST_RETIRED +Branch misprediction rate = BR_MIS_PRED/INST_RETIRED +Branch misprediction ratio = BR_MIS_PRED/(BR_PRED+BR_MIS_PRED) +Instructions per branch = INST_RETIRED/(BR_PRED+BR_MIS_PRED) +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio directly +expresses what fraction of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate.
+ diff --git a/collectors/likwid/groups/arm8_tx2/DATA.txt b/collectors/likwid/groups/arm8_tx2/DATA.txt new file mode 100644 index 0000000..09681c2 --- /dev/null +++ b/collectors/likwid/groups/arm8_tx2/DATA.txt @@ -0,0 +1,25 @@ +SHORT Load to store ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 LD_RETIRED +PMC3 ST_RETIRED + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +Load to store ratio PMC2/PMC3 +Load ratio PMC2/PMC0 +Store ratio PMC3/PMC0 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +Load to store ratio = LD_RETIRED / ST_RETIRED +Load ratio = LD_RETIRED / INST_RETIRED +Store ratio = ST_RETIRED / INST_RETIRED +- +This is a metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/arm8_tx2/FLOPS_DP.txt b/collectors/likwid/groups/arm8_tx2/FLOPS_DP.txt new file mode 100644 index 0000000..5b477de --- /dev/null +++ b/collectors/likwid/groups/arm8_tx2/FLOPS_DP.txt @@ -0,0 +1,28 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 VFP_SPEC +PMC3 ASE_SPEC + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +DP [MFLOP/s] 1.0E-06*(PMC3*2.0+PMC2)/time +NEON DP [MFLOP/s] 1.0E-06*(PMC3*2.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC2/time +Vectorization ratio 100*(PMC3)/(PMC2+PMC3) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(ASE_SPEC*2+VFP_SPEC)/runtime +NEON DP [MFLOP/s] = 1.0E-06*(ASE_SPEC*2)/runtime +Packed [MUOPS/s] = 1.0E-06*(ASE_SPEC)/runtime +Scalar [MUOPS/s] = 1.0E-06*VFP_SPEC/runtime +Vectorization ratio = 100*(ASE_SPEC)/(ASE_SPEC+VFP_SPEC) +- +NEON scalar and packed double precision FLOP rates. + diff --git a/collectors/likwid/groups/arm8_tx2/FLOPS_SP.txt b/collectors/likwid/groups/arm8_tx2/FLOPS_SP.txt new file mode 100644 index 0000000..9857308 --- /dev/null +++ b/collectors/likwid/groups/arm8_tx2/FLOPS_SP.txt @@ -0,0 +1,28 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 VFP_SPEC +PMC3 ASE_SPEC + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +SP [MFLOP/s] 1.0E-06*(PMC3*4.0+PMC2)/time +NEON SP [MFLOP/s] 1.0E-06*(PMC3*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC2/time +Vectorization ratio 100*(PMC3)/(PMC2+PMC3) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(ASE_SPEC*4+VFP_SPEC)/runtime +NEON SP [MFLOP/s] = 1.0E-06*(ASE_SPEC*4)/runtime +Packed [MUOPS/s] = 1.0E-06*(ASE_SPEC)/runtime +Scalar [MUOPS/s] = 1.0E-06*VFP_SPEC/runtime +Vectorization ratio = 100*(ASE_SPEC)/(ASE_SPEC+VFP_SPEC) +- +NEON scalar and packed single precision FLOP rates. + diff --git a/collectors/likwid/groups/arm8_tx2/ICACHE.txt b/collectors/likwid/groups/arm8_tx2/ICACHE.txt new file mode 100644 index 0000000..fbaf3be --- /dev/null +++ b/collectors/likwid/groups/arm8_tx2/ICACHE.txt @@ -0,0 +1,23 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L1I_CACHE +PMC3 L1I_CACHE_REFILL + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +L1I request rate PMC2/PMC0 +L1I miss rate PMC3/PMC0 +L1I miss ratio PMC3/PMC2 + +LONG +Formulas: +L1I request rate = L1I_CACHE / INST_RETIRED +L1I miss rate = L1I_CACHE_REFILL / INST_RETIRED +L1I miss ratio = L1I_CACHE_REFILL / L1I_CACHE +- +This group measures some L1 instruction cache metrics.
diff --git a/collectors/likwid/groups/arm8_tx2/L2.txt b/collectors/likwid/groups/arm8_tx2/L2.txt new file mode 100644 index 0000000..53bec4c --- /dev/null +++ b/collectors/likwid/groups/arm8_tx2/L2.txt @@ -0,0 +1,41 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L1D_CACHE_REFILL +PMC3 L1D_CACHE_WB +PMC4 L1I_CACHE_REFILL + + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC2*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L2I load bandwidth [MBytes/s] 1.0E-06*PMC4*64.0/time +L2I load data volume [GBytes] 1.0E-09*PMC4*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3+PMC4)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC2+PMC3+PMC4)*64.0 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_REFILL*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_CACHE_REFILL*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_WB*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_CACHE_WB*64.0 +L2I load bandwidth [MBytes/s] = 1.0E-06*L1I_CACHE_REFILL*64.0/time +L2I load data volume [GBytes] = 1.0E-09*L1I_CACHE_REFILL*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0/time +L2 data volume [GBytes] = 1.0E-09*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*64.0 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the +number of cache lines loaded from the L2 into the L1 data cache and the writebacks from +the L1 data cache to the L2 cache. The group also outputs the total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and cache lines transferred into the instruction +cache. diff --git a/collectors/likwid/groups/arm8_tx2/L2CACHE.txt b/collectors/likwid/groups/arm8_tx2/L2CACHE.txt new file mode 100644 index 0000000..4696e28 --- /dev/null +++ b/collectors/likwid/groups/arm8_tx2/L2CACHE.txt @@ -0,0 +1,32 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L2D_CACHE +PMC3 L2D_CACHE_REFILL + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +L2 request rate PMC2/PMC0 +L2 miss rate PMC3/PMC0 +L2 miss ratio PMC3/PMC2 + +LONG +Formulas: +L2 request rate = L2D_CACHE/INST_RETIRED +L2 miss rate = L2D_CACHE_REFILL/INST_RETIRED +L2 miss ratio = L2D_CACHE_REFILL/L2D_CACHE +- +This group measures the locality of your data accesses with regard to the +L2 cache. The L2 request rate tells you how data intensive your code is, +i.e. how many data accesses you have on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally, the L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+ + diff --git a/collectors/likwid/groups/arm8_tx2/L3.txt b/collectors/likwid/groups/arm8_tx2/L3.txt new file mode 100644 index 0000000..4c99a05 --- /dev/null +++ b/collectors/likwid/groups/arm8_tx2/L3.txt @@ -0,0 +1,38 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L2D_CACHE_REFILL +PMC3 L2D_CACHE_WB +PMC4 L2D_CACHE_ALLOCATE + + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +L3 load bandwidth [MBytes/s] 1.0E-06*(PMC2-PMC4)*64.0/time +L3 load data volume [GBytes] 1.0E-09*(PMC2-PMC4)*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3-PMC4)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3-PMC4)*64.0 + +LONG +Formulas: +CPI = CPU_CYCLES/INST_RETIRED +L3 load bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL-L2D_CACHE_ALLOCATE)*64.0/time +L3 load data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL-L2D_CACHE_ALLOCATE)*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2D_CACHE_WB*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2D_CACHE_WB*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL+L2D_CACHE_WB-L2D_CACHE_ALLOCATE)*64.0/time +L3 data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL+L2D_CACHE_WB-L2D_CACHE_ALLOCATE)*64.0 +- +Profiling group to measure L2 <-> L3 cache bandwidth. The bandwidth is computed by the +number of cache lines loaded from the L3 to the L2 data cache and the writebacks from +the L2 data cache to the L3 cache. The group also outputs total data volume transferred between +L3 and L2. For streaming stores, the cache lines are allocated directly in L2, so there +is no traffic between L3 and L2 in this case. But since the L2D_CACHE_REFILL event counts these +allocated cache lines as well, the value of L2D_CACHE_REFILL is reduced +by L2D_CACHE_ALLOCATE. diff --git a/collectors/likwid/groups/arm8_tx2/MEM.txt b/collectors/likwid/groups/arm8_tx2/MEM.txt new file mode 100644 index 0000000..06bc697 --- /dev/null +++ b/collectors/likwid/groups/arm8_tx2/MEM.txt @@ -0,0 +1,32 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +MBOX0C0 MEMORY_READS +MBOX0C1 MEMORY_WRITES +MBOX1C0 MEMORY_READS +MBOX1C1 MEMORY_WRITES + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MEMORY_READS))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(MEMORY_READS))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MEMORY_WRITES))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(MEMORY_WRITES))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MEMORY_READS)+SUM(MEMORY_WRITES))*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(SUM(MEMORY_READS)+SUM(MEMORY_WRITES))*64.0 +- +Profiling group to measure memory bandwidth. It uses the performance monitoring +hardware of the memory controllers.
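The L2D_CACHE_ALLOCATE correction in the L3 group is worth spelling out: lines allocated directly in L2 by streaming stores are counted by L2D_CACHE_REFILL but cause no L3 traffic, so they are subtracted out. A minimal Go sketch of that corrected load bandwidth (function name and counts are hypothetical):

package main

import "fmt"

// l3LoadBandwidth mirrors the corrected L3 load bandwidth of the arm8_tx2
// L3 group: L2D_CACHE_ALLOCATE (lines allocated in L2 by streaming stores,
// which generate no L3 traffic) is subtracted from L2D_CACHE_REFILL before
// scaling by the 64-byte line size to MBytes/s.
func l3LoadBandwidth(refill, allocate, seconds float64) float64 {
	return 1.0e-06 * (refill - allocate) * 64.0 / seconds
}

func main() {
	// Hypothetical counts for a 1 s run containing streaming stores.
	fmt.Printf("L3 load bandwidth: %.1f MBytes/s\n", l3LoadBandwidth(2.0e8, 5.0e7, 1.0))
}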
diff --git a/collectors/likwid/groups/arm8_tx2/SPEC.txt b/collectors/likwid/groups/arm8_tx2/SPEC.txt new file mode 100644 index 0000000..7561d3a --- /dev/null +++ b/collectors/likwid/groups/arm8_tx2/SPEC.txt @@ -0,0 +1,44 @@ +SHORT Information about speculative execution + +EVENTSET +PMC0 INST_SPEC +PMC1 LD_SPEC +PMC2 ST_SPEC +PMC3 DP_SPEC +PMC4 VFP_SPEC +PMC5 ASE_SPEC + + +METRICS +Runtime (RDTSC) [s] time +Operations spec. executed PMC0 +Load ops spec. executed PMC1 +Store ops spec. executed PMC2 +Integer data ops spec. executed PMC3 +Scalar FP ops spec. executed PMC4 +Vector FP ops spec. executed PMC5 +Other ops spec. executed (PMC0-PMC1-PMC2-PMC3-PMC4-PMC5) +Load ops spec. ratio PMC1/PMC0 +Store ops spec. ratio PMC2/PMC0 +Integer data ops spec. ratio PMC3/PMC0 +Scalar FP ops spec. ratio PMC4/PMC0 +Vector FP ops spec. ratio PMC5/PMC0 +Other ops spec. ratio (PMC0-PMC1-PMC2-PMC3-PMC4-PMC5)/PMC0 + + + + +LONG +Formulas: +Load ops spec. ratio = LD_SPEC / INST_SPEC +Store ops spec. ratio = ST_SPEC / INST_SPEC +Integer data ops spec. ratio = DP_SPEC / INST_SPEC +Scalar FP ops spec. ratio = VFP_SPEC / INST_SPEC +Vector FP ops spec. ratio = ASE_SPEC / INST_SPEC +Other ops spec. ratio = (INST_SPEC-LD_SPEC-ST_SPEC-DP_SPEC-VFP_SPEC-ASE_SPEC) / INST_SPEC +- +This group gives information about the speculative execution of micro-ops. +It is currently unclear why the Other ops spec. executed count and ratio are negative +in some cases. Although the documentation contains an OP_RETIRED event, there is no +equivalent OP_SPEC which would be a better reference in this group than +INST_SPEC. diff --git a/collectors/likwid/groups/arm8_tx2/TLB_DATA.txt b/collectors/likwid/groups/arm8_tx2/TLB_DATA.txt new file mode 100644 index 0000000..054b0ec --- /dev/null +++ b/collectors/likwid/groups/arm8_tx2/TLB_DATA.txt @@ -0,0 +1,27 @@ +SHORT L1 data TLB miss rate/ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L1D_TLB_REFILL_RD +PMC3 L1D_TLB_REFILL_WR + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +L1 DTLB load misses PMC2 +L1 DTLB load miss rate PMC2/PMC0 +L1 DTLB store misses PMC3 +L1 DTLB store miss rate PMC3/PMC0 + +LONG +Formulas: +L1 DTLB load misses = L1D_TLB_REFILL_RD +L1 DTLB load miss rate = L1D_TLB_REFILL_RD / INST_RETIRED +L1 DTLB store misses = L1D_TLB_REFILL_WR +L1 DTLB store miss rate = L1D_TLB_REFILL_WR / INST_RETIRED +- +The DTLB load and store miss rates give a measure of how often a TLB miss occurred +per instruction. + diff --git a/collectors/likwid/groups/arm8_tx2/TLB_INSTR.txt b/collectors/likwid/groups/arm8_tx2/TLB_INSTR.txt new file mode 100644 index 0000000..c1111c8 --- /dev/null +++ b/collectors/likwid/groups/arm8_tx2/TLB_INSTR.txt @@ -0,0 +1,23 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +PMC0 INST_RETIRED +PMC1 CPU_CYCLES +PMC2 L1I_TLB_REFILL + +METRICS +Runtime (RDTSC) [s] time +Clock [MHz] 1.E-06*PMC1/time +CPI PMC1/PMC0 +L1 ITLB misses PMC2 +L1 ITLB miss rate PMC2/PMC0 + + +LONG +Formulas: +L1 ITLB misses = L1I_TLB_REFILL +L1 ITLB miss rate = L1I_TLB_REFILL / INST_RETIRED +- +The ITLB miss rate gives a measure of how often a TLB miss occurred +per instruction.
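The "Other ops" metrics of the SPEC group are a residual: INST_SPEC minus the five per-class counters, which is exactly why they can come out negative if an instruction is counted in more than one class. A Go sketch of the residual (function name and counts are hypothetical):

package main

import "fmt"

// otherOpsRatio reproduces the "Other ops spec. ratio" of the SPEC group:
// the share of INST_SPEC not covered by the load, store, integer,
// scalar-FP and vector-FP class counters. As the group text notes, this
// residual can be negative when classes overlap.
func otherOpsRatio(inst, ld, st, dp, vfp, ase float64) float64 {
	return (inst - ld - st - dp - vfp - ase) / inst
}

func main() {
	// Hypothetical counter values.
	fmt.Printf("Other ops spec. ratio: %.3f\n", otherOpsRatio(1e9, 3e8, 1e8, 2e8, 1e8, 2e8))
}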
+ diff --git a/collectors/likwid/groups/atom/BRANCH.txt b/collectors/likwid/groups/atom/BRANCH.txt new file mode 100644 index 0000000..7b2bb20 --- /dev/null +++ b/collectors/likwid/groups/atom/BRANCH.txt @@ -0,0 +1,29 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +PMC0 BR_INST_RETIRED_ANY +PMC1 BR_INST_RETIRED_MISPRED + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ANY/INSTR_RETIRED_ANY +Branch misprediction rate = BR_INST_RETIRED_MISPRED/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_INST_RETIRED_MISPRED/BR_INST_RETIRED_ANY +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ANY +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio directly +expresses what fraction of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/atom/DATA.txt b/collectors/likwid/groups/atom/DATA.txt new file mode 100644 index 0000000..b2d007f --- /dev/null +++ b/collectors/likwid/groups/atom/DATA.txt @@ -0,0 +1,20 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +PMC0 L1D_CACHE_LD +PMC1 L1D_CACHE_ST + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = L1D_CACHE_LD/L1D_CACHE_ST +- +This is a simple metric to determine your load to store ratio.
+ diff --git a/collectors/likwid/groups/atom/FLOPS_DP.txt b/collectors/likwid/groups/atom/FLOPS_DP.txt new file mode 100644 index 0000000..53b2d02 --- /dev/null +++ b/collectors/likwid/groups/atom/FLOPS_DP.txt @@ -0,0 +1,25 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +PMC0 SIMD_COMP_INST_RETIRED_PACKED_DOUBLE +PMC1 SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time + + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_DOUBLE*2.0+SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE)/runtime +Packed [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_PACKED_DOUBLE/runtime +Scalar [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE/runtime +-- +Double Precision MFLOP/s. + diff --git a/collectors/likwid/groups/atom/FLOPS_SP.txt b/collectors/likwid/groups/atom/FLOPS_SP.txt new file mode 100644 index 0000000..0046d5b --- /dev/null +++ b/collectors/likwid/groups/atom/FLOPS_SP.txt @@ -0,0 +1,24 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +PMC0 SIMD_COMP_INST_RETIRED_PACKED_SINGLE +PMC1 SIMD_COMP_INST_RETIRED_SCALAR_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*(PMC0)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_SINGLE*4.0+SIMD_COMP_INST_RETIRED_SCALAR_SINGLE)/runtime +Packed [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_PACKED_SINGLE/runtime +Scalar [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_SCALAR_SINGLE/runtime +-- +Single Precision MFLOP/s. + diff --git a/collectors/likwid/groups/atom/FLOPS_X87.txt b/collectors/likwid/groups/atom/FLOPS_X87.txt new file mode 100644 index 0000000..58c5d42 --- /dev/null +++ b/collectors/likwid/groups/atom/FLOPS_X87.txt @@ -0,0 +1,19 @@ +SHORT X87 MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +PMC0 X87_COMP_OPS_EXE_ANY_AR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +X87 [MFLOP/s] 1.0E-06*PMC0/time + +LONG +Formulas: +X87 [MFLOP/s] = 1.0E-06*X87_COMP_OPS_EXE_ANY_AR/runtime +-- +The MFLOP/s executed with X87 instructions. + diff --git a/collectors/likwid/groups/atom/MEM.txt b/collectors/likwid/groups/atom/MEM.txt new file mode 100644 index 0000000..355b7fd --- /dev/null +++ b/collectors/likwid/groups/atom/MEM.txt @@ -0,0 +1,21 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +PMC0 BUS_TRANS_MEM_THIS_CORE_THIS_A + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +Memory bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +Memory data volume [GBytes] 1.0E-09*PMC0*64.0 + +LONG +Formulas: +Memory bandwidth [MBytes/s] = 1.0E-06*BUS_TRANS_MEM_THIS_CORE_THIS_A*64.0/time +Memory data volume [GBytes] = 1.0E-09*BUS_TRANS_MEM_THIS_CORE_THIS_A*64.0 +- +Profiling group to measure memory bandwidth drawn by this core.
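The atom FLOP formulas follow the usual SSE convention: a packed 128-bit instruction counts as 2 DP (or 4 SP) FLOPs, a scalar instruction as 1. A Go sketch of the DP variant (function name and counter values are hypothetical):

package main

import "fmt"

// dpMFLOPs mirrors the atom FLOPS_DP metric: each packed 128-bit SSE
// double-precision instruction performs 2 FLOPs, each scalar one 1 FLOP.
func dpMFLOPs(packed, scalar, seconds float64) float64 {
	return 1.0e-06 * (packed*2.0 + scalar) / seconds
}

func main() {
	// Hypothetical counts for a 1 s measurement.
	fmt.Printf("DP: %.1f MFLOP/s\n", dpMFLOPs(4e8, 1e8, 1.0))
}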
+ diff --git a/collectors/likwid/groups/atom/TLB.txt b/collectors/likwid/groups/atom/TLB.txt new file mode 100644 index 0000000..5d0aa1b --- /dev/null +++ b/collectors/likwid/groups/atom/TLB.txt @@ -0,0 +1,21 @@ +SHORT TLB miss rate + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +PMC0 DATA_TLB_MISSES_DTLB_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +DTLB misses PMC0 +DTLB miss rate PMC0/FIXC0 + +LONG +Formulas: +DTLB misses = DATA_TLB_MISSES_DTLB_MISS +DTLB miss rate = DATA_TLB_MISSES_DTLB_MISS/INSTR_RETIRED_ANY +-- +The DTLB miss rate gives a measure of how often a TLB miss occurred per instruction. + diff --git a/collectors/likwid/groups/broadwell/BRANCH.txt b/collectors/likwid/groups/broadwell/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/broadwell/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio directly +expresses what fraction of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/broadwell/CLOCK.txt b/collectors/likwid/groups/broadwell/CLOCK.txt new file mode 100644 index 0000000..b81bee6 --- /dev/null +++ b/collectors/likwid/groups/broadwell/CLOCK.txt @@ -0,0 +1,26 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +UBOXFIX UNCORE_CLOCK + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*UBOXFIX/time +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time +- +Broadwell implements the new RAPL interface. This interface enables one to +monitor the consumed energy on the package (socket) level.
+ diff --git a/collectors/likwid/groups/broadwell/CYCLE_ACTIVITY.txt b/collectors/likwid/groups/broadwell/CYCLE_ACTIVITY.txt new file mode 100644 index 0000000..c432a44 --- /dev/null +++ b/collectors/likwid/groups/broadwell/CYCLE_ACTIVITY.txt @@ -0,0 +1,38 @@ +SHORT Cycle Activities + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING +PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING +PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING +PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Cycles without execution [%] (PMC3/FIXC1)*100 +Cycles without execution due to L1D [%] (PMC2/FIXC1)*100 +Cycles without execution due to L2 [%] (PMC0/FIXC1)*100 +Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100 +-- +This performance group measures the cycles while waiting for data from the cache +and memory hierarchy. +CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts number of cycles nothing is executed on +any execution port. +CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while memory subsystem has an +outstanding load.
diff --git a/collectors/likwid/groups/broadwell/CYCLE_STALLS.txt b/collectors/likwid/groups/broadwell/CYCLE_STALLS.txt new file mode 100644 index 0000000..795aeb9 --- /dev/null +++ b/collectors/likwid/groups/broadwell/CYCLE_STALLS.txt @@ -0,0 +1,45 @@ +SHORT Cycle Activities (Stalls) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING +PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING +PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING +PMC3 CYCLE_ACTIVITY_STALLS_TOTAL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Total execution stalls PMC3 +Stalls caused by L1D misses [%] (PMC2/PMC3)*100 +Stalls caused by L2 misses [%] (PMC0/PMC3)*100 +Stalls caused by memory loads [%] (PMC1/PMC3)*100 +Execution stall rate [%] (PMC3/FIXC1)*100 +Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100 +Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100 +Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL +Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100 +-- +This performance group measures the stalls caused by data traffic in the cache +hierarchy. +CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls. +CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while memory subsystem has +an outstanding load. diff --git a/collectors/likwid/groups/broadwell/DATA.txt b/collectors/likwid/groups/broadwell/DATA.txt new file mode 100644 index 0000000..6955eb7 --- /dev/null +++ b/collectors/likwid/groups/broadwell/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_UOPS_RETIRED_LOADS_ALL +PMC1 MEM_UOPS_RETIRED_STORES_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_UOPS_RETIRED_LOADS_ALL/MEM_UOPS_RETIRED_STORES_ALL +- +This is a metric to determine your load to store ratio. 
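CYCLE_STALLS reports every stall reason twice: normalized to all execution stalls and, in the "rate" variants, normalized to all core cycles. A Go sketch of that double normalization (function name and counts are hypothetical):

package main

import "fmt"

// stallShares reproduces the CYCLE_STALLS breakdown: a stall reason is
// reported both relative to all execution stalls (CYCLE_ACTIVITY_STALLS_TOTAL)
// and relative to all core cycles (CPU_CLK_UNHALTED_CORE).
func stallShares(reason, stalls, cycles float64) (ofStalls, ofCycles float64) {
	return reason / stalls * 100.0, reason / cycles * 100.0
}

func main() {
	// Hypothetical: 1e8 cycles stalled on L2 misses out of 4e8 total
	// stalls within 1e9 core cycles.
	s, c := stallShares(1e8, 4e8, 1e9)
	fmt.Printf("L2-miss stalls: %.1f%% of stalls, %.1f%% of cycles\n", s, c)
}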
+ diff --git a/collectors/likwid/groups/broadwell/DIVIDE.txt b/collectors/likwid/groups/broadwell/DIVIDE.txt new file mode 100644 index 0000000..c7c5fb2 --- /dev/null +++ b/collectors/likwid/groups/broadwell/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0:EDGEDETECT ARITH_FPU_DIV_ACTIVE +PMC1 ARITH_FPU_DIV_ACTIVE + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0:EDGEDETECT +Avg. divide unit usage duration PMC1/PMC0:EDGEDETECT + +LONG +Formulas: +Number of divide ops = ARITH_FPU_DIV_ACTIVE:EDGEDETECT +Avg. divide unit usage duration = ARITH_FPU_DIV_ACTIVE/ARITH_FPU_DIV_ACTIVE:EDGEDETECT +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/broadwell/ENERGY.txt b/collectors/likwid/groups/broadwell/ENERGY.txt new file mode 100644 index 0000000..09eaeb1 --- /dev/null +++ b/collectors/likwid/groups/broadwell/ENERGY.txt @@ -0,0 +1,39 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR2 PWR_PP1_ENERGY +PWR3 PWR_DRAM_ENERGY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy PP1 [J] PWR2 +Power PP1 [W] PWR2/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power PP1 = PWR_PP1_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +Broadwell implements the new RAPL interface. This interface enables one to +monitor the consumed energy on the package (socket) and DRAM level. + diff --git a/collectors/likwid/groups/broadwell/FALSE_SHARE.txt b/collectors/likwid/groups/broadwell/FALSE_SHARE.txt new file mode 100644 index 0000000..a297654 --- /dev/null +++ b/collectors/likwid/groups/broadwell/FALSE_SHARE.txt @@ -0,0 +1,25 @@ +SHORT False sharing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM +PMC2 MEM_UOPS_RETIRED_LOADS_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local LLC false sharing [MByte] 1.E-06*PMC0*64 +Local LLC false sharing rate PMC0/PMC2 + +LONG +Formulas: +Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM*64 +Local LLC false sharing rate = MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM/MEM_UOPS_RETIRED_LOADS_ALL +- +False-sharing of cache lines can dramatically reduce the performance of an +application. This performance group measures the L3 traffic induced by false-sharing. +The false-sharing rate uses all memory load UOPs as reference.
diff --git a/collectors/likwid/groups/broadwell/FLOPS_AVX.txt b/collectors/likwid/groups/broadwell/FLOPS_AVX.txt new file mode 100644 index 0000000..7854608 --- /dev/null +++ b/collectors/likwid/groups/broadwell/FLOPS_AVX.txt @@ -0,0 +1,24 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +- +FLOP rates of 256-bit packed floating-point instructions. + diff --git a/collectors/likwid/groups/broadwell/FLOPS_DP.txt b/collectors/likwid/groups/broadwell/FLOPS_DP.txt new file mode 100644 index 0000000..348ec76 --- /dev/null +++ b/collectors/likwid/groups/broadwell/FLOPS_DP.txt @@ -0,0 +1,31 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE) +- +AVX/SSE scalar and packed double precision FLOP rates.
+ diff --git a/collectors/likwid/groups/broadwell/FLOPS_SP.txt b/collectors/likwid/groups/broadwell/FLOPS_SP.txt new file mode 100644 index 0000000..1d7fd7c --- /dev/null +++ b/collectors/likwid/groups/broadwell/FLOPS_SP.txt @@ -0,0 +1,31 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE) +- +AVX/SSE scalar and packed single precision FLOP rates. + diff --git a/collectors/likwid/groups/broadwell/ICACHE.txt b/collectors/likwid/groups/broadwell/ICACHE.txt new file mode 100644 index 0000000..5f11ad6 --- /dev/null +++ b/collectors/likwid/groups/broadwell/ICACHE.txt @@ -0,0 +1,25 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ICACHE_ACCESSES +PMC1 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 + +LONG +Formulas: +L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY +L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES +- +This group measures some L1 instruction cache metrics. 
diff --git a/collectors/likwid/groups/broadwell/L2.txt b/collectors/likwid/groups/broadwell/L2.txt new file mode 100644 index 0000000..60c7f79 --- /dev/null +++ b/collectors/likwid/groups/broadwell/L2.txt @@ -0,0 +1,37 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L2_TRANS_L1D_WB +PMC2 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the +number of cache lines loaded from the L2 to the L1 data cache and the writebacks from +the L1 data cache to the L2 cache. The group also outputs total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and cache lines transferred into the instruction +cache. diff --git a/collectors/likwid/groups/broadwell/L2CACHE.txt b/collectors/likwid/groups/broadwell/L2CACHE.txt new file mode 100644 index 0000000..9b5dd4b --- /dev/null +++ b/collectors/likwid/groups/broadwell/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_TRANS_ALL_REQUESTS +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse.
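The bandwidth metrics of the L2 group (and of the L3, CACHES and MEM groups below) all follow one convention: a cache-line transfer count is multiplied by 64 bytes per line and divided by the measurement time. A small Go sketch with invented counts:

package main

import "fmt"

// lineBandwidthMBs converts a cache-line transfer count into MBytes/s,
// using the 64-byte cache line assumed throughout these groups.
func lineBandwidthMBs(lines, seconds float64) float64 {
    return 1.0e-06 * lines * 64.0 / seconds
}

func main() {
    // Invented counts over a 5 s interval.
    pmc0 := 8.0e8 // L1D_REPLACEMENT (lines loaded from L2)
    pmc1 := 3.0e8 // L2_TRANS_L1D_WB (lines written back to L2)
    pmc2 := 1.0e7 // ICACHE_MISSES (instruction lines)
    t := 5.0

    fmt.Printf("L2D load bandwidth [MBytes/s] = %.1f\n", lineBandwidthMBs(pmc0, t))
    fmt.Printf("L2D evict bandwidth [MBytes/s] = %.1f\n", lineBandwidthMBs(pmc1, t))
    fmt.Printf("L2 bandwidth [MBytes/s] = %.1f\n", lineBandwidthMBs(pmc0+pmc1+pmc2, t))
}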
+ + diff --git a/collectors/likwid/groups/broadwell/L3.txt b/collectors/likwid/groups/broadwell/L3.txt new file mode 100644 index 0000000..98d1d9e --- /dev/null +++ b/collectors/likwid/groups/broadwell/L3.txt @@ -0,0 +1,36 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ALL +PMC1 L2_TRANS_L2_WB + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the +number of cache lines allocated in the L2 and the number of modified cache lines +evicted from the L2. This group also outputs the data volume transferred between the +L3 and the measured cores' L2 caches. Note that this bandwidth also includes data +transfers due to a write allocate load on a store miss in L2. + diff --git a/collectors/likwid/groups/broadwell/L3CACHE.txt b/collectors/likwid/groups/broadwell/L3CACHE.txt new file mode 100644 index 0000000..f863daa --- /dev/null +++ b/collectors/likwid/groups/broadwell/L3CACHE.txt @@ -0,0 +1,35 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_RETIRED_L3_ALL +PMC1 MEM_LOAD_UOPS_RETIRED_L3_MISS +PMC2 UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate PMC0/PMC2 +L3 miss rate PMC1/PMC2 +L3 miss ratio PMC1/PMC0 + +LONG +Formulas: +L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL +L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL +L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL +- +This group measures the locality of your data accesses with regard to the +L3 cache. L3 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L3 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+ + diff --git a/collectors/likwid/groups/broadwell/PORT_USAGE.txt b/collectors/likwid/groups/broadwell/PORT_USAGE.txt new file mode 100644 index 0000000..298df1d --- /dev/null +++ b/collectors/likwid/groups/broadwell/PORT_USAGE.txt @@ -0,0 +1,50 @@ +SHORT Execution port utilization + +REQUIRE_NOHT + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_PORT_PORT_0 +PMC1 UOPS_EXECUTED_PORT_PORT_1 +PMC2 UOPS_EXECUTED_PORT_PORT_2 +PMC3 UOPS_EXECUTED_PORT_PORT_3 +PMC4 UOPS_EXECUTED_PORT_PORT_4 +PMC5 UOPS_EXECUTED_PORT_PORT_5 +PMC6 UOPS_EXECUTED_PORT_PORT_6 +PMC7 UOPS_EXECUTED_PORT_PORT_7 + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Port0 usage ratio PMC0/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port1 usage ratio PMC1/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port2 usage ratio PMC2/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port3 usage ratio PMC3/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port4 usage ratio PMC4/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port5 usage ratio PMC5/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port6 usage ratio PMC6/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port7 usage ratio PMC7/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) + +LONG +Formulas: +Port0 usage ratio = UOPS_EXECUTED_PORT_PORT_0/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port1 usage ratio = UOPS_EXECUTED_PORT_PORT_1/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port2 usage ratio = UOPS_EXECUTED_PORT_PORT_2/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port3 usage ratio = UOPS_EXECUTED_PORT_PORT_3/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port4 usage ratio = UOPS_EXECUTED_PORT_PORT_4/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port5 usage ratio = UOPS_EXECUTED_PORT_PORT_5/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port6 usage ratio = UOPS_EXECUTED_PORT_PORT_6/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port7 usage ratio = UOPS_EXECUTED_PORT_PORT_7/SUM(UOPS_EXECUTED_PORT_PORT_*) +- +This group measures the execution port utilization in a CPU core. The group can +only be measured when HyperThreading is disabled because only then each CPU core +can program eight counters. +Please be aware that the counters PMC4-7 are broken on Intel Broadwell. They +don't increment if either user- or kernel-level filtering is applied. User-level +filtering is default in LIKWID, hence kernel-level filtering is added +automatically for PMC4-7. The returned counts can be much higher. diff --git a/collectors/likwid/groups/broadwell/RECOVERY.txt b/collectors/likwid/groups/broadwell/RECOVERY.txt new file mode 100644 index 0000000..7928ee4 --- /dev/null +++ b/collectors/likwid/groups/broadwell/RECOVERY.txt @@ -0,0 +1,22 @@ +SHORT Recovery duration + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INT_MISC_RECOVERY_CYCLES +PMC1 INT_MISC_RECOVERY_COUNT + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Avg. recovery duration PMC0/PMC1 + +LONG +Formulas: +Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT +- +This group measures the duration of recoveries after SSE exception, memory +disambiguation, etc... 
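The PORT_USAGE ratios above are simply each port's share of all executed uOPs. A Go sketch over eight invented port counters:

package main

import "fmt"

func main() {
    // Invented UOPS_EXECUTED_PORT_PORT_0..7 readings.
    ports := []float64{9.1e8, 8.7e8, 5.2e8, 5.0e8, 3.9e8, 6.4e8, 7.7e8, 1.2e8}

    var total float64
    for _, p := range ports {
        total += p
    }
    // PortN usage ratio = PMCN / SUM(PMC0..PMC7), as in the METRICS block above.
    for i, p := range ports {
        fmt.Printf("Port%d usage ratio = %.3f\n", i, p/total)
    }
}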
diff --git a/collectors/likwid/groups/broadwell/TLB_DATA.txt b/collectors/likwid/groups/broadwell/TLB_DATA.txt new file mode 100644 index 0000000..8d94e05 --- /dev/null +++ b/collectors/likwid/groups/broadwell/TLB_DATA.txt @@ -0,0 +1,35 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK +PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK +PMC2 DTLB_LOAD_MISSES_WALK_DURATION +PMC3 DTLB_STORE_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB load misses PMC0 +L1 DTLB load miss rate PMC0/FIXC0 +L1 DTLB load miss duration [Cyc] PMC2/PMC0 +L1 DTLB store misses PMC1 +L1 DTLB store miss rate PMC1/FIXC0 +L1 DTLB store miss duration [Cyc] PMC3/PMC1 + +LONG +Formulas: +L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK +L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK +- +The DTLB load and store miss rates give a measure of how often a TLB miss occurred +per instruction. The duration measures how long, in cycles, a page table walk took. + diff --git a/collectors/likwid/groups/broadwell/TLB_INSTR.txt b/collectors/likwid/groups/broadwell/TLB_INSTR.txt new file mode 100644 index 0000000..235d977 --- /dev/null +++ b/collectors/likwid/groups/broadwell/TLB_INSTR.txt @@ -0,0 +1,28 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ITLB_MISSES_CAUSES_A_WALK +PMC1 ITLB_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + + +LONG +Formulas: +L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK +L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK +- +The ITLB miss rate gives a measure of how often a TLB miss occurred +per instruction. The duration measures how long, in cycles, a page table walk took.
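Both TLB groups derive the average walk penalty by dividing the accumulated walk-duration cycles by the number of walks, and normalize miss counts to retired instructions. A Go sketch with invented readings:

package main

import "fmt"

func main() {
    // Invented counter readings.
    walks := 2.0e6        // DTLB_LOAD_MISSES_CAUSES_A_WALK
    walkCycles := 6.0e7   // DTLB_LOAD_MISSES_WALK_DURATION
    instructions := 4.0e9 // INSTR_RETIRED_ANY

    missRate := walks / instructions   // misses per retired instruction
    missDuration := walkCycles / walks // average cycles per page table walk

    fmt.Printf("L1 DTLB load miss rate = %.2e\n", missRate)
    fmt.Printf("L1 DTLB load miss duration [Cyc] = %.1f\n", missDuration)
}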
+ diff --git a/collectors/likwid/groups/broadwell/TMA.txt b/collectors/likwid/groups/broadwell/TMA.txt new file mode 100644 index 0000000..afb4126 --- /dev/null +++ b/collectors/likwid/groups/broadwell/TMA.txt @@ -0,0 +1,48 @@ +SHORT Top down cycle allocation + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_RETIRED_RETIRE_SLOTS +PMC2 IDQ_UOPS_NOT_DELIVERED_CORE +PMC3 INT_MISC_RECOVERY_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +IPC FIXC0/FIXC1 +Total Slots 4*FIXC1 +Slots Retired PMC1 +Fetch Bubbles PMC2 +Recovery Bubbles 4*PMC3 +Front End [%] PMC2/(4*FIXC1)*100 +Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100 +Retiring [%] PMC1/(4*FIXC1)*100 +Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100 + +LONG +Formulas: +Total Slots = 4*CPU_CLK_UNHALTED_CORE +Slots Retired = UOPS_RETIRED_RETIRE_SLOTS +Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE +Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES +Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100 +Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100 +Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100 +Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100 +-- +This performance group measures cycles to determine percentage of time spent in +front end, back end, retiring and speculation. These metrics are published and +verified by Intel. Further information: +Webpage describing Top-Down Method and its usage in Intel vTune: +https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method +Paper by Yasin Ahmad: +https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0 +Slides by Yasin Ahmad: +http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf +The performance group was originally published here: +http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/ diff --git a/collectors/likwid/groups/broadwell/UOPS.txt b/collectors/likwid/groups/broadwell/UOPS.txt new file mode 100644 index 0000000..e6cc208 --- /dev/null +++ b/collectors/likwid/groups/broadwell/UOPS.txt @@ -0,0 +1,35 @@ +SHORT UOPs execution info + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_EXECUTED_THREAD +PMC2 UOPS_RETIRED_ALL +PMC3 UOPS_ISSUED_FLAGS_MERGE + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Issued UOPs PMC0 +Merged UOPs PMC3 +Executed UOPs PMC1 +Retired UOPs PMC2 + +LONG +Formulas: +Issued UOPs = UOPS_ISSUED_ANY +Merged UOPs = UOPS_ISSUED_FLAGS_MERGE +Executed UOPs = UOPS_EXECUTED_THREAD +Retired UOPs = UOPS_RETIRED_ALL +- +This group returns information about the instruction pipeline. It measures the +issued, executed and retired uOPs and returns the number of uOPs which were issued +but not executed as well as the number of uOPs which were executed but never retired. +The executed but not retired uOPs commonly come from speculatively executed branches. 
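The TMA group above accounts four issue slots per core cycle and splits them into the four top-down categories, which should sum to roughly 100%. A Go sketch of that slot accounting, with invented counter readings:

package main

import "fmt"

func main() {
    // Invented counter readings for one interval.
    fixc1 := 3.0e9 // CPU_CLK_UNHALTED_CORE
    pmc0 := 7.0e9  // UOPS_ISSUED_ANY
    pmc1 := 6.5e9  // UOPS_RETIRED_RETIRE_SLOTS
    pmc2 := 2.0e9  // IDQ_UOPS_NOT_DELIVERED_CORE
    pmc3 := 5.0e7  // INT_MISC_RECOVERY_CYCLES

    slots := 4 * fixc1 // four issue slots per core cycle

    frontEnd := pmc2 / slots * 100
    speculation := (pmc0 - pmc1 + 4*pmc3) / slots * 100
    retiring := pmc1 / slots * 100
    backEnd := (1 - (pmc2+pmc0+4*pmc3)/slots) * 100

    fmt.Printf("Front End [%%] = %.1f\n", frontEnd)      // ~16.7
    fmt.Printf("Speculation [%%] = %.1f\n", speculation) // ~5.8
    fmt.Printf("Retiring [%%] = %.1f\n", retiring)       // ~54.2
    fmt.Printf("Back End [%%] = %.1f\n", backEnd)        // ~23.3
}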
+ diff --git a/collectors/likwid/groups/broadwellD/BRANCH.txt b/collectors/likwid/groups/broadwellD/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio directly states +what fraction of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/broadwellD/CACHES.txt b/collectors/likwid/groups/broadwellD/CACHES.txt new file mode 100644 index 0000000..275e30f --- /dev/null +++ b/collectors/likwid/groups/broadwellD/CACHES.txt @@ -0,0 +1,123 @@ +SHORT Cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L2_TRANS_L1D_WB +PMC2 L2_LINES_IN_ALL +PMC3 L2_TRANS_L2_WB +CBOX0C1 LLC_VICTIMS_M +CBOX1C1 LLC_VICTIMS_M +CBOX2C1 LLC_VICTIMS_M +CBOX3C1 LLC_VICTIMS_M +CBOX4C1 LLC_VICTIMS_M +CBOX5C1 LLC_VICTIMS_M +CBOX6C1 LLC_VICTIMS_M +CBOX7C1 LLC_VICTIMS_M +CBOX8C1 LLC_VICTIMS_M +CBOX9C1 LLC_VICTIMS_M +CBOX10C1 LLC_VICTIMS_M +CBOX11C1 LLC_VICTIMS_M +CBOX12C1 LLC_VICTIMS_M +CBOX13C1 LLC_VICTIMS_M +CBOX14C1 LLC_VICTIMS_M +CBOX15C1 LLC_VICTIMS_M +CBOX0C0 LLC_LOOKUP_DATA_READ +CBOX1C0 LLC_LOOKUP_DATA_READ +CBOX2C0 LLC_LOOKUP_DATA_READ +CBOX3C0 LLC_LOOKUP_DATA_READ +CBOX4C0 LLC_LOOKUP_DATA_READ +CBOX5C0 LLC_LOOKUP_DATA_READ +CBOX6C0 LLC_LOOKUP_DATA_READ +CBOX7C0 LLC_LOOKUP_DATA_READ +CBOX8C0 LLC_LOOKUP_DATA_READ +CBOX9C0 LLC_LOOKUP_DATA_READ +CBOX10C0 LLC_LOOKUP_DATA_READ +CBOX11C0 LLC_LOOKUP_DATA_READ +CBOX12C0 LLC_LOOKUP_DATA_READ +CBOX13C0 LLC_LOOKUP_DATA_READ +CBOX14C0 LLC_LOOKUP_DATA_READ +CBOX15C0 LLC_LOOKUP_DATA_READ +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 +L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time +L3 to
L2 load data volume [GBytes] 1.0E-09*PMC2*64.0 +L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time +L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 +System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0)*64.0/time +System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0)*64.0 +L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1)*64/time +L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1)*64 +L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1)*64.0/time +L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1)*64.0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 + +LONG +Formulas: +L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time +L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64 +L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64/time +L1 to L2 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64 +L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB)*64/time +L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB)*64 +L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time +L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64 +L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time +L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64 +L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ))*64/time +System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ))*64 
+L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M))*64/time +L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M))*64 +L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64/time +L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64 +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 +- +Group to measure cache transfers between L1 and memory. Please note that the +L3 to/from system metrics contain any traffic to the system (memory, +Intel QPI, etc.) but do not seem to capture all of it, because the memory read +bandwidth and the L3 to L2 bandwidth are commonly higher than the system to L3 bandwidth. + diff --git a/collectors/likwid/groups/broadwellD/CLOCK.txt b/collectors/likwid/groups/broadwellD/CLOCK.txt new file mode 100644 index 0000000..b81bee6 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/CLOCK.txt @@ -0,0 +1,26 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +UBOXFIX UNCORE_CLOCK + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*UBOXFIX/time +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time +- +Broadwell implements the new RAPL interface. This interface enables +monitoring of the consumed energy on the package (socket) level. + diff --git a/collectors/likwid/groups/broadwellD/CYCLE_ACTIVITY.txt b/collectors/likwid/groups/broadwellD/CYCLE_ACTIVITY.txt new file mode 100644 index 0000000..c432a44 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/CYCLE_ACTIVITY.txt @@ -0,0 +1,38 @@ +SHORT Cycle Activities + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING +PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING +PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING +PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Cycles without execution [%] (PMC3/FIXC1)*100 +Cycles without execution due to L1D [%] (PMC2/FIXC1)*100 +Cycles without execution due to L2 [%] (PMC0/FIXC1)*100 +Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100 +-- +This performance group measures the cycles spent waiting for data from the cache +and memory hierarchy. +CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts the number of cycles in which nothing is executed on +any execution port.
+CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while memory subsystem has an +outstanding load. diff --git a/collectors/likwid/groups/broadwellD/CYCLE_STALLS.txt b/collectors/likwid/groups/broadwellD/CYCLE_STALLS.txt new file mode 100644 index 0000000..795aeb9 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/CYCLE_STALLS.txt @@ -0,0 +1,45 @@ +SHORT Cycle Activities (Stalls) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING +PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING +PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING +PMC3 CYCLE_ACTIVITY_STALLS_TOTAL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Total execution stalls PMC3 +Stalls caused by L1D misses [%] (PMC2/PMC3)*100 +Stalls caused by L2 misses [%] (PMC0/PMC3)*100 +Stalls caused by memory loads [%] (PMC1/PMC3)*100 +Execution stall rate [%] (PMC3/FIXC1)*100 +Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100 +Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100 +Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL +Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100 +-- +This performance group measures the stalls caused by data traffic in the cache +hierarchy. +CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls. +CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while memory subsystem has +an outstanding load. diff --git a/collectors/likwid/groups/broadwellD/DATA.txt b/collectors/likwid/groups/broadwellD/DATA.txt new file mode 100644 index 0000000..6955eb7 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_UOPS_RETIRED_LOADS_ALL +PMC1 MEM_UOPS_RETIRED_STORES_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_UOPS_RETIRED_LOADS_ALL/MEM_UOPS_RETIRED_STORES_ALL +- +This is a metric to determine your load to store ratio. 
+ diff --git a/collectors/likwid/groups/broadwellD/DIVIDE.txt b/collectors/likwid/groups/broadwellD/DIVIDE.txt new file mode 100644 index 0000000..c7c5fb2 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0:EDGEDETECT ARITH_FPU_DIV_ACTIVE +PMC1 ARITH_FPU_DIV_ACTIVE + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0:EDGEDETECT +Avg. divide unit usage duration PMC1/PMC0:EDGEDETECT + +LONG +Formulas: +Number of divide ops = ARITH_FPU_DIV_ACTIVE:EDGEDETECT +Avg. divide unit usage duration = ARITH_FPU_DIV_ACTIVE/ARITH_FPU_DIV_ACTIVE:EDGEDETECT +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/broadwellD/ENERGY.txt b/collectors/likwid/groups/broadwellD/ENERGY.txt new file mode 100644 index 0000000..09eaeb1 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/ENERGY.txt @@ -0,0 +1,39 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR2 PWR_PP1_ENERGY +PWR3 PWR_DRAM_ENERGY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy PP1 [J] PWR2 +Power PP1 [W] PWR2/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power PP1 = PWR_PP1_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +Broadwell implements the new RAPL interface. This interface enables +monitoring of the consumed energy on the package (socket) and DRAM level. + diff --git a/collectors/likwid/groups/broadwellD/FALSE_SHARE.txt b/collectors/likwid/groups/broadwellD/FALSE_SHARE.txt new file mode 100644 index 0000000..68107bf --- /dev/null +++ b/collectors/likwid/groups/broadwellD/FALSE_SHARE.txt @@ -0,0 +1,25 @@ +SHORT False sharing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM +PMC2 MEM_UOPS_RETIRED_LOADS_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local LLC false sharing [MByte] 1.E-06*PMC0*64 +Local LLC false sharing rate PMC0/PMC2 + +LONG +Formulas: +Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM*64 +Local LLC false sharing rate = MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM/MEM_UOPS_RETIRED_LOADS_ALL +- +False-sharing of cache lines can dramatically reduce the performance of an +application. This performance group measures the L3 traffic induced by false-sharing. +The false-sharing rate uses all memory loads as reference.
diff --git a/collectors/likwid/groups/broadwellD/FLOPS_AVX.txt b/collectors/likwid/groups/broadwellD/FLOPS_AVX.txt new file mode 100644 index 0000000..7854608 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/FLOPS_AVX.txt @@ -0,0 +1,24 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +- +FLOP rates of 256 bit packed floating-point instructions + diff --git a/collectors/likwid/groups/broadwellD/FLOPS_DP.txt b/collectors/likwid/groups/broadwellD/FLOPS_DP.txt new file mode 100644 index 0000000..348ec76 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/FLOPS_DP.txt @@ -0,0 +1,31 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE) +- +AVX/SSE scalar and packed double precision FLOP rates. 
+ diff --git a/collectors/likwid/groups/broadwellD/FLOPS_SP.txt b/collectors/likwid/groups/broadwellD/FLOPS_SP.txt new file mode 100644 index 0000000..1d7fd7c --- /dev/null +++ b/collectors/likwid/groups/broadwellD/FLOPS_SP.txt @@ -0,0 +1,31 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE) +- +AVX/SSE scalar and packed single precision FLOP rates. + diff --git a/collectors/likwid/groups/broadwellD/HA.txt b/collectors/likwid/groups/broadwellD/HA.txt new file mode 100644 index 0000000..1e5a700 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/HA.txt @@ -0,0 +1,40 @@ +SHORT Main memory bandwidth in MBytes/s seen from Home agent + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +BBOX0C0 IMC_READS_NORMAL +BBOX0C1 BYPASS_IMC_TAKEN +BBOX0C2 IMC_WRITES_ALL +BBOX1C0 IMC_READS_NORMAL +BBOX1C1 BYPASS_IMC_TAKEN +BBOX1C2 IMC_WRITES_ALL + + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(BBOX0C2+BBOX1C2)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(BBOX0C2+BBOX1C2)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1+BBOX0C2+BBOX1C2)*64.0/time +Memory data volume [GBytes] 1.0E-09*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1+BBOX0C2+BBOX1C2)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_READS_NORMAL)+SUM(BYPASS_IMC_TAKEN))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(IMC_READS_NORMAL)+SUM(BYPASS_IMC_TAKEN))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_WRITES_ALL))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(IMC_WRITES_ALL))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_READS_NORMAL) + SUM(BYPASS_IMC_TAKEN) + SUM(IMC_WRITES_ALL))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(IMC_READS_NORMAL) + SUM(BYPASS_IMC_TAKEN) + SUM(IMC_WRITES_ALL))*64.0 +- +This group derives the same metrics as the MEM group but uses the events of the +Home Agent, a central unit that is responsible for the protocol side of memory
+interactions. diff --git a/collectors/likwid/groups/broadwellD/ICACHE.txt b/collectors/likwid/groups/broadwellD/ICACHE.txt new file mode 100644 index 0000000..5f11ad6 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/ICACHE.txt @@ -0,0 +1,25 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ICACHE_ACCESSES +PMC1 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 + +LONG +Formulas: +L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY +L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES +- +This group measures some L1 instruction cache metrics. diff --git a/collectors/likwid/groups/broadwellD/L2.txt b/collectors/likwid/groups/broadwellD/L2.txt new file mode 100644 index 0000000..60c7f79 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/L2.txt @@ -0,0 +1,37 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L2_TRANS_L1D_WB +PMC2 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the +number of cache lines loaded from the L2 to the L1 data cache and the writebacks from +the L1 data cache to the L2 cache. The group also outputs total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and cache lines transferred into the instruction +cache.
diff --git a/collectors/likwid/groups/broadwellD/L2CACHE.txt b/collectors/likwid/groups/broadwellD/L2CACHE.txt new file mode 100644 index 0000000..9b5dd4b --- /dev/null +++ b/collectors/likwid/groups/broadwellD/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_TRANS_ALL_REQUESTS +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/broadwellD/L3.txt b/collectors/likwid/groups/broadwellD/L3.txt new file mode 100644 index 0000000..98d1d9e --- /dev/null +++ b/collectors/likwid/groups/broadwellD/L3.txt @@ -0,0 +1,36 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ALL +PMC1 L2_TRANS_L2_WB + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the +number of cache lines allocated in the L2 and the number of modified cache lines +evicted from the L2. This group also outputs the data volume transferred between the +L3 and the measured cores' L2 caches. Note that this bandwidth also includes data +transfers due to a write allocate load on a store miss in L2.
+ diff --git a/collectors/likwid/groups/broadwellD/L3CACHE.txt b/collectors/likwid/groups/broadwellD/L3CACHE.txt new file mode 100644 index 0000000..f863daa --- /dev/null +++ b/collectors/likwid/groups/broadwellD/L3CACHE.txt @@ -0,0 +1,35 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_RETIRED_L3_ALL +PMC1 MEM_LOAD_UOPS_RETIRED_L3_MISS +PMC2 UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate PMC0/PMC2 +L3 miss rate PMC1/PMC2 +L3 miss ratio PMC1/PMC0 + +LONG +Formulas: +L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL +L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL +L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL +- +This group measures the locality of your data accesses with regard to the +L3 cache. L3 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L3 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/broadwellD/MEM.txt b/collectors/likwid/groups/broadwellD/MEM.txt new file mode 100644 index 0000000..2a17a2c --- /dev/null +++ b/collectors/likwid/groups/broadwellD/MEM.txt @@ -0,0 +1,52 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on a +per-socket basis. Some of the counters may not be available on your system. +Also outputs total data volume transferred from main memory. +The same metrics are provided by the HA group. + diff --git a/collectors/likwid/groups/broadwellD/MEM_DP.txt b/collectors/likwid/groups/broadwellD/MEM_DP.txt new file mode 100644 index 0000000..71ce2ae --- /dev/null +++ b/collectors/likwid/groups/broadwellD/MEM_DP.txt @@ -0,0 +1,73 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +MFLOP/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time +AVX [MFLOP/s] 1.0E-06*(PMC2*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Operational intensity (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +AVX [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 
1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) +-- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per-socket basis. Also outputs total data volume transferred from main memory. +SSE scalar and packed double precision FLOP rates. Also reports on packed +256-bit AVX instructions. +The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. diff --git a/collectors/likwid/groups/broadwellD/MEM_SP.txt b/collectors/likwid/groups/broadwellD/MEM_SP.txt new file mode 100644 index 0000000..6d67ea7 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/MEM_SP.txt @@ -0,0 +1,73 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +MFLOP/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time +AVX [MFLOP/s] 1.0E-06*(PMC2*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Operational intensity (PMC0*4.0+PMC1+PMC2*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +AVX [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +Packed [MUOPS/s] = 
1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) +-- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per socket base. Also outputs total data volume transferred from main memory. +SSE scalar and packed single precision FLOP rates. Also reports on packed AVX +32b instructions. +The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. diff --git a/collectors/likwid/groups/broadwellD/PORT_USAGE.txt b/collectors/likwid/groups/broadwellD/PORT_USAGE.txt new file mode 100644 index 0000000..298df1d --- /dev/null +++ b/collectors/likwid/groups/broadwellD/PORT_USAGE.txt @@ -0,0 +1,50 @@ +SHORT Execution port utilization + +REQUIRE_NOHT + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_PORT_PORT_0 +PMC1 UOPS_EXECUTED_PORT_PORT_1 +PMC2 UOPS_EXECUTED_PORT_PORT_2 +PMC3 UOPS_EXECUTED_PORT_PORT_3 +PMC4 UOPS_EXECUTED_PORT_PORT_4 +PMC5 UOPS_EXECUTED_PORT_PORT_5 +PMC6 UOPS_EXECUTED_PORT_PORT_6 +PMC7 UOPS_EXECUTED_PORT_PORT_7 + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Port0 usage ratio PMC0/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port1 usage ratio PMC1/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port2 usage ratio PMC2/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port3 usage ratio PMC3/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port4 usage ratio PMC4/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port5 usage ratio PMC5/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port6 usage ratio PMC6/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port7 usage ratio PMC7/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) + +LONG +Formulas: +Port0 usage ratio = UOPS_EXECUTED_PORT_PORT_0/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port1 usage ratio = UOPS_EXECUTED_PORT_PORT_1/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port2 usage ratio = UOPS_EXECUTED_PORT_PORT_2/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port3 usage ratio = UOPS_EXECUTED_PORT_PORT_3/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port4 usage ratio = UOPS_EXECUTED_PORT_PORT_4/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port5 usage ratio = UOPS_EXECUTED_PORT_PORT_5/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port6 usage ratio = UOPS_EXECUTED_PORT_PORT_6/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port7 usage ratio = UOPS_EXECUTED_PORT_PORT_7/SUM(UOPS_EXECUTED_PORT_PORT_*) +- +This group measures the execution port utilization in a CPU core. The group can +only be measured when HyperThreading is disabled because only then each CPU core +can program eight counters. 
+Please be aware that the counters PMC4-7 are broken on Intel Broadwell. They +don't increment if either user- or kernel-level filtering is applied. User-level +filtering is the default in LIKWID, hence kernel-level filtering is added +automatically for PMC4-7. As a consequence, the returned counts for PMC4-7 can be +much higher than expected. diff --git a/collectors/likwid/groups/broadwellD/RECOVERY.txt b/collectors/likwid/groups/broadwellD/RECOVERY.txt new file mode 100644 index 0000000..7928ee4 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/RECOVERY.txt @@ -0,0 +1,22 @@ +SHORT Recovery duration + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INT_MISC_RECOVERY_CYCLES +PMC1 INT_MISC_RECOVERY_COUNT + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Avg. recovery duration PMC0/PMC1 + +LONG +Formulas: +Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT +- +This group measures the duration of recoveries after SSE exceptions, memory +disambiguation, etc. diff --git a/collectors/likwid/groups/broadwellD/TLB_DATA.txt b/collectors/likwid/groups/broadwellD/TLB_DATA.txt new file mode 100644 index 0000000..8d94e05 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/TLB_DATA.txt @@ -0,0 +1,35 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK +PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK +PMC2 DTLB_LOAD_MISSES_WALK_DURATION +PMC3 DTLB_STORE_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB load misses PMC0 +L1 DTLB load miss rate PMC0/FIXC0 +L1 DTLB load miss duration [Cyc] PMC2/PMC0 +L1 DTLB store misses PMC1 +L1 DTLB store miss rate PMC1/FIXC0 +L1 DTLB store miss duration [Cyc] PMC3/PMC1 + +LONG +Formulas: +L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK +L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK +- +The DTLB load and store miss rates give a measure of how often a TLB miss occurred +per instruction. The miss durations give the average number of cycles a page table walk took.
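+For illustration, with made-up counts: DTLB_LOAD_MISSES_CAUSES_A_WALK = 1.0E06 +over INSTR_RETIRED_ANY = 1.0E09 gives a load miss rate of 1.0E-03, i.e. one walk +per 1000 instructions; DTLB_LOAD_MISSES_WALK_DURATION = 3.0E07 then corresponds +to an average walk duration of 30 cycles.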
+ diff --git a/collectors/likwid/groups/broadwellD/TLB_INSTR.txt b/collectors/likwid/groups/broadwellD/TLB_INSTR.txt new file mode 100644 index 0000000..235d977 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/TLB_INSTR.txt @@ -0,0 +1,28 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ITLB_MISSES_CAUSES_A_WALK +PMC1 ITLB_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + + +LONG +Formulas: +L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK +L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK +- +The ITLB miss rate gives a measure of how often a TLB miss occurred +per instruction. The miss duration gives the average number of cycles a page table walk took. + diff --git a/collectors/likwid/groups/broadwellD/TMA.txt b/collectors/likwid/groups/broadwellD/TMA.txt new file mode 100644 index 0000000..afb4126 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/TMA.txt @@ -0,0 +1,48 @@ +SHORT Top down cycle allocation + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_RETIRED_RETIRE_SLOTS +PMC2 IDQ_UOPS_NOT_DELIVERED_CORE +PMC3 INT_MISC_RECOVERY_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +IPC FIXC0/FIXC1 +Total Slots 4*FIXC1 +Slots Retired PMC1 +Fetch Bubbles PMC2 +Recovery Bubbles 4*PMC3 +Front End [%] PMC2/(4*FIXC1)*100 +Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100 +Retiring [%] PMC1/(4*FIXC1)*100 +Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100 + +LONG +Formulas: +Total Slots = 4*CPU_CLK_UNHALTED_CORE +Slots Retired = UOPS_RETIRED_RETIRE_SLOTS +Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE +Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES +Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100 +Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100 +Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100 +Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100 +-- +This performance group measures cycles to determine the percentage of time spent in +the front end, back end, retiring and speculation. These metrics are published and +verified by Intel. 
Further information: +Webpage describing Top-Down Method and its usage in Intel vTune: +https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method +Paper by Ahmad Yasin: +https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0 +Slides by Ahmad Yasin: +http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf +The performance group was originally published here: +http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/ diff --git a/collectors/likwid/groups/broadwellD/UOPS.txt b/collectors/likwid/groups/broadwellD/UOPS.txt new file mode 100644 index 0000000..e6cc208 --- /dev/null +++ b/collectors/likwid/groups/broadwellD/UOPS.txt @@ -0,0 +1,35 @@ +SHORT UOPs execution info + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_EXECUTED_THREAD +PMC2 UOPS_RETIRED_ALL +PMC3 UOPS_ISSUED_FLAGS_MERGE + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Issued UOPs PMC0 +Merged UOPs PMC3 +Executed UOPs PMC1 +Retired UOPs PMC2 + +LONG +Formulas: +Issued UOPs = UOPS_ISSUED_ANY +Merged UOPs = UOPS_ISSUED_FLAGS_MERGE +Executed UOPs = UOPS_EXECUTED_THREAD +Retired UOPs = UOPS_RETIRED_ALL +- +This group returns information about the instruction pipeline. It measures the +issued, executed and retired uOPs and returns the number of uOPs which were issued +but not executed as well as the number of uOPs which were executed but never retired. +The executed but not retired uOPs commonly come from speculatively executed branches. + diff --git a/collectors/likwid/groups/broadwellEP/BRANCH.txt b/collectors/likwid/groups/broadwellEP/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often, on average, a branch or a mispredicted branch occurred +per retired instruction. The branch misprediction ratio expresses the fraction of +all branch instructions that were mispredicted. Instructions per branch is 1/branch rate.
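+As a hypothetical example: BR_INST_RETIRED_ALL_BRANCHES = 2.0E08 with +INSTR_RETIRED_ANY = 1.0E09 gives a branch rate of 0.2 (one branch every five +instructions); BR_MISP_RETIRED_ALL_BRANCHES = 2.0E06 then gives a branch +misprediction ratio of 0.01.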
+ diff --git a/collectors/likwid/groups/broadwellEP/CACHES.txt b/collectors/likwid/groups/broadwellEP/CACHES.txt new file mode 100644 index 0000000..6a14e52 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/CACHES.txt @@ -0,0 +1,135 @@ +SHORT Cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L2_TRANS_L1D_WB +PMC2 L2_LINES_IN_ALL +PMC3 L2_TRANS_L2_WB +CBOX0C1 LLC_VICTIMS_M +CBOX1C1 LLC_VICTIMS_M +CBOX2C1 LLC_VICTIMS_M +CBOX3C1 LLC_VICTIMS_M +CBOX4C1 LLC_VICTIMS_M +CBOX5C1 LLC_VICTIMS_M +CBOX6C1 LLC_VICTIMS_M +CBOX7C1 LLC_VICTIMS_M +CBOX8C1 LLC_VICTIMS_M +CBOX9C1 LLC_VICTIMS_M +CBOX10C1 LLC_VICTIMS_M +CBOX11C1 LLC_VICTIMS_M +CBOX12C1 LLC_VICTIMS_M +CBOX13C1 LLC_VICTIMS_M +CBOX14C1 LLC_VICTIMS_M +CBOX15C1 LLC_VICTIMS_M +CBOX16C1 LLC_VICTIMS_M +CBOX17C1 LLC_VICTIMS_M +CBOX18C1 LLC_VICTIMS_M +CBOX19C1 LLC_VICTIMS_M +CBOX20C1 LLC_VICTIMS_M +CBOX21C1 LLC_VICTIMS_M +CBOX0C0 LLC_LOOKUP_DATA_READ +CBOX1C0 LLC_LOOKUP_DATA_READ +CBOX2C0 LLC_LOOKUP_DATA_READ +CBOX3C0 LLC_LOOKUP_DATA_READ +CBOX4C0 LLC_LOOKUP_DATA_READ +CBOX5C0 LLC_LOOKUP_DATA_READ +CBOX6C0 LLC_LOOKUP_DATA_READ +CBOX7C0 LLC_LOOKUP_DATA_READ +CBOX8C0 LLC_LOOKUP_DATA_READ +CBOX9C0 LLC_LOOKUP_DATA_READ +CBOX10C0 LLC_LOOKUP_DATA_READ +CBOX11C0 LLC_LOOKUP_DATA_READ +CBOX12C0 LLC_LOOKUP_DATA_READ +CBOX13C0 LLC_LOOKUP_DATA_READ +CBOX14C0 LLC_LOOKUP_DATA_READ +CBOX15C0 LLC_LOOKUP_DATA_READ +CBOX16C0 LLC_LOOKUP_DATA_READ +CBOX17C0 LLC_LOOKUP_DATA_READ +CBOX18C0 LLC_LOOKUP_DATA_READ +CBOX19C0 LLC_LOOKUP_DATA_READ +CBOX20C0 LLC_LOOKUP_DATA_READ +CBOX21C0 LLC_LOOKUP_DATA_READ +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 +L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time +L3 to L2 load data volume [GBytes] 1.0E-09*PMC2*64.0 +L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time +L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 +System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0)*64.0/time +System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0)*64.0 +L3 to system bandwidth [MBytes/s] 
1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1)*64/time +L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1)*64 +L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1)*64.0/time +L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1)*64.0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 + +LONG +Formulas: +L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time +L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64 +L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time +L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64 +L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time +L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64 +L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time +L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64 +L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time +L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64 +L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ))*64/time +System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ))*64 +L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M))*64/time +L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M))*64 +L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64/time +L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64 +Memory read bandwidth [MBytes/s] = 
1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 +- +Group to measure cache transfers between L1 and memory. Please note that the +L3 to/from system metrics contain any traffic to the system (memory, +Intel QPI, etc.), so they do not seem to capture all of this traffic: commonly the +measured memory read bandwidth and L3 to L2 bandwidth are higher than the system +to L3 bandwidth. + diff --git a/collectors/likwid/groups/broadwellEP/CLOCK.txt b/collectors/likwid/groups/broadwellEP/CLOCK.txt new file mode 100644 index 0000000..b81bee6 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/CLOCK.txt @@ -0,0 +1,26 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +UBOXFIX UNCORE_CLOCK + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*UBOXFIX/time +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time +- +Broadwell implements the new RAPL interface. This interface makes it possible to +monitor the energy consumed at the package (socket) level. + diff --git a/collectors/likwid/groups/broadwellEP/CYCLE_ACTIVITY.txt b/collectors/likwid/groups/broadwellEP/CYCLE_ACTIVITY.txt new file mode 100644 index 0000000..c432a44 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/CYCLE_ACTIVITY.txt @@ -0,0 +1,38 @@ +SHORT Cycle Activities + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING +PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING +PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING +PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Cycles without execution [%] (PMC3/FIXC1)*100 +Cycles without execution due to L1D [%] (PMC2/FIXC1)*100 +Cycles without execution due to L2 [%] (PMC0/FIXC1)*100 +Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100 +-- +This performance group measures the cycles while waiting for data from the cache +and memory hierarchy. +CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts the number of cycles in which nothing is +executed on any execution port. +CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while memory subsystem has an +outstanding load. 
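+As a rough illustration with invented counts: CYCLE_ACTIVITY_CYCLES_NO_EXECUTE = +2.5E08 out of CPU_CLK_UNHALTED_CORE = 1.0E09 means that on 25% of all cycles no +uOP was executed on any port.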
diff --git a/collectors/likwid/groups/broadwellEP/CYCLE_STALLS.txt b/collectors/likwid/groups/broadwellEP/CYCLE_STALLS.txt new file mode 100644 index 0000000..795aeb9 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/CYCLE_STALLS.txt @@ -0,0 +1,45 @@ +SHORT Cycle Activities (Stalls) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING +PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING +PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING +PMC3 CYCLE_ACTIVITY_STALLS_TOTAL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Total execution stalls PMC3 +Stalls caused by L1D misses [%] (PMC2/PMC3)*100 +Stalls caused by L2 misses [%] (PMC0/PMC3)*100 +Stalls caused by memory loads [%] (PMC1/PMC3)*100 +Execution stall rate [%] (PMC3/FIXC1)*100 +Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100 +Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100 +Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL +Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100 +-- +This performance group measures the stalls caused by data traffic in the cache +hierarchy. +CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls. +CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while memory subsystem has +an outstanding load. diff --git a/collectors/likwid/groups/broadwellEP/DATA.txt b/collectors/likwid/groups/broadwellEP/DATA.txt new file mode 100644 index 0000000..6955eb7 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_UOPS_RETIRED_LOADS_ALL +PMC1 MEM_UOPS_RETIRED_STORES_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_UOPS_RETIRED_LOADS_ALL/MEM_UOPS_RETIRED_STORES_ALL +- +This is a metric to determine your load to store ratio. 
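+As an illustration, a streaming kernel such as a[i] = b[i] + c[i] performs two +loads per store, so one would expect a load to store ratio of roughly 2.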
+ diff --git a/collectors/likwid/groups/broadwellEP/DIVIDE.txt b/collectors/likwid/groups/broadwellEP/DIVIDE.txt new file mode 100644 index 0000000..c7c5fb2 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0:EDGEDETECT ARITH_FPU_DIV_ACTIVE +PMC1 ARITH_FPU_DIV_ACTIVE + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0:EDGEDETECT +Avg. divide unit usage duration PMC1/PMC0:EDGEDETECT + +LONG +Formulas: +Number of divide ops = ARITH_FPU_DIV_ACTIVE:EDGEDETECT +Avg. divide unit usage duration = ARITH_FPU_DIV_ACTIVE/ARITH_FPU_DIV_ACTIVE:EDGEDETECT +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/broadwellEP/ENERGY.txt b/collectors/likwid/groups/broadwellEP/ENERGY.txt new file mode 100644 index 0000000..fe7829f --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/ENERGY.txt @@ -0,0 +1,35 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR3 PWR_DRAM_ENERGY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +Broadwell implements the new RAPL interface. This interface makes it possible to +monitor the energy consumed at the package (socket) and DRAM level. + diff --git a/collectors/likwid/groups/broadwellEP/FALSE_SHARE.txt b/collectors/likwid/groups/broadwellEP/FALSE_SHARE.txt new file mode 100644 index 0000000..602b606 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/FALSE_SHARE.txt @@ -0,0 +1,30 @@ +SHORT False sharing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM +PMC1 MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM +PMC2 MEM_UOPS_RETIRED_LOADS_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local LLC false sharing [MByte] 1.E-06*PMC0*64 +Local LLC false sharing rate PMC0/PMC2 +Remote LLC false sharing [MByte] 1.E-06*PMC1*64 +Remote LLC false sharing rate PMC1/PMC2 + +LONG +Formulas: +Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM*64 +Local LLC false sharing rate = MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM/MEM_UOPS_RETIRED_LOADS_ALL +Remote LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM*64 +Remote LLC false sharing rate = MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM/MEM_UOPS_RETIRED_LOADS_ALL +- +False-sharing of cache lines can dramatically reduce the performance of an +application. This performance group measures the L3 traffic induced by false-sharing. +The false-sharing rate uses all memory load UOPs as reference.
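+Hypothetical illustration: MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM = 1.0E06 loads +that hit a modified line in another core's cache correspond to +1.E-06*1.0E06*64 = 64 MByte of local LLC false-sharing traffic.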
diff --git a/collectors/likwid/groups/broadwellEP/FLOPS_AVX.txt b/collectors/likwid/groups/broadwellEP/FLOPS_AVX.txt new file mode 100644 index 0000000..7854608 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/FLOPS_AVX.txt @@ -0,0 +1,24 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +- +FLOP rates of 256 bit packed floating-point instructions + diff --git a/collectors/likwid/groups/broadwellEP/FLOPS_DP.txt b/collectors/likwid/groups/broadwellEP/FLOPS_DP.txt new file mode 100644 index 0000000..348ec76 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/FLOPS_DP.txt @@ -0,0 +1,31 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE) +- +AVX/SSE scalar and packed double precision FLOP rates. 
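+A worked example with invented counts over a 1 s interval: +FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE = 2.0E08, +FP_ARITH_INST_RETIRED_SCALAR_DOUBLE = 1.0E08 and +FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE = 5.0E08 give +1.0E-06*(2.0E08*2+1.0E08+5.0E08*4)/1 = 2500 MFLOP/s, of which +1.0E-06*(5.0E08*4)/1 = 2000 MFLOP/s come from AVX instructions.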
+ diff --git a/collectors/likwid/groups/broadwellEP/FLOPS_SP.txt b/collectors/likwid/groups/broadwellEP/FLOPS_SP.txt new file mode 100644 index 0000000..1d7fd7c --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/FLOPS_SP.txt @@ -0,0 +1,31 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio [%] 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE) +- +AVX/SSE scalar and packed single precision FLOP rates. + diff --git a/collectors/likwid/groups/broadwellEP/HA.txt b/collectors/likwid/groups/broadwellEP/HA.txt new file mode 100644 index 0000000..1e5a700 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/HA.txt @@ -0,0 +1,40 @@ +SHORT Main memory bandwidth in MBytes/s seen from Home agent + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +BBOX0C0 IMC_READS_NORMAL +BBOX0C1 BYPASS_IMC_TAKEN +BBOX0C2 IMC_WRITES_ALL +BBOX1C0 IMC_READS_NORMAL +BBOX1C1 BYPASS_IMC_TAKEN +BBOX1C2 IMC_WRITES_ALL + + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(BBOX0C2+BBOX1C2)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(BBOX0C2+BBOX1C2)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1+BBOX0C2+BBOX1C2)*64.0/time +Memory data volume [GBytes] 1.0E-09*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1+BBOX0C2+BBOX1C2)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_READS_NORMAL)+SUM(BYPASS_IMC_TAKEN))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(IMC_READS_NORMAL)+SUM(BYPASS_IMC_TAKEN))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_WRITES_ALL))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(IMC_WRITES_ALL))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_READS_NORMAL) + SUM(BYPASS_IMC_TAKEN) + SUM(IMC_WRITES_ALL))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(IMC_READS_NORMAL) + SUM(BYPASS_IMC_TAKEN) + SUM(IMC_WRITES_ALL))*64.0 +- +This group derives the same metrics as the MEM group but uses the events of the +Home Agent, a central unit that is responsible for the protocol side of 
memory +interactions. diff --git a/collectors/likwid/groups/broadwellEP/ICACHE.txt b/collectors/likwid/groups/broadwellEP/ICACHE.txt new file mode 100644 index 0000000..5f11ad6 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/ICACHE.txt @@ -0,0 +1,25 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ICACHE_ACCESSES +PMC1 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 + +LONG +Formulas: +L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY +L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES +- +This group measures some L1 instruction cache metrics. diff --git a/collectors/likwid/groups/broadwellEP/L2.txt b/collectors/likwid/groups/broadwellEP/L2.txt new file mode 100644 index 0000000..60c7f79 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/L2.txt @@ -0,0 +1,37 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L2_TRANS_L1D_WB +PMC2 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the +number of cache lines loaded from the L2 to the L1 data cache and the writebacks from +the L1 data cache to the L2 cache. The group also outputs the total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and cache lines transferred into the instruction +cache. 
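+For illustration with a made-up count: L1D_REPLACEMENT = 1.0E09 cache lines +refilled within one second corresponds to 1.0E-06*1.0E09*64.0 = 64000 MBytes/s +of L2 to L1 load bandwidth.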
diff --git a/collectors/likwid/groups/broadwellEP/L2CACHE.txt b/collectors/likwid/groups/broadwellEP/L2CACHE.txt new file mode 100644 index 0000000..9b5dd4b --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_TRANS_ALL_REQUESTS +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS +- +This group measures the locality of your data accesses with regard to the +L2 cache. The L2 request rate tells you how data intensive your code is, +i.e. how many data accesses you have on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cache lines from memory. Finally, the L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/broadwellEP/L3.txt b/collectors/likwid/groups/broadwellEP/L3.txt new file mode 100644 index 0000000..98d1d9e --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/L3.txt @@ -0,0 +1,36 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ALL +PMC1 L2_TRANS_L2_WB + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed from the +number of cache lines allocated in the L2 and the number of modified cache lines +evicted from the L2. This group also outputs the data volume transferred between the +L3 and the measured cores' L2 caches. Note that this bandwidth also includes data +transfers due to a write allocate load on a store miss in L2. 
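+Hypothetical numbers for orientation: L2_LINES_IN_ALL = 5.0E08 and +L2_TRANS_L2_WB = 1.0E08 over a 2 s run give an L3 bandwidth of +1.0E-06*(5.0E08+1.0E08)*64/2 = 19200 MBytes/s.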
+ diff --git a/collectors/likwid/groups/broadwellEP/L3CACHE.txt b/collectors/likwid/groups/broadwellEP/L3CACHE.txt new file mode 100644 index 0000000..f863daa --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/L3CACHE.txt @@ -0,0 +1,35 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_RETIRED_L3_ALL +PMC1 MEM_LOAD_UOPS_RETIRED_L3_MISS +PMC2 UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate PMC0/PMC2 +L3 miss rate PMC1/PMC2 +L3 miss ratio PMC1/PMC0 + +LONG +Formulas: +L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL +L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL +L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL +- +This group measures the locality of your data accesses with regard to the +L3 cache. The L3 request rate tells you how data intensive your code is, +i.e. how many data accesses you have on average per instruction. +The L3 miss rate gives a measure of how often it was necessary to get +cache lines from memory. Finally, the L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/broadwellEP/MEM.txt b/collectors/likwid/groups/broadwellEP/MEM.txt new file mode 100644 index 0000000..2a17a2c --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/MEM.txt @@ -0,0 +1,52 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime 
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on a +per socket base. Some of the counters may not be available on your system. +Also outputs total data volume transferred from main memory. +The same metrics are provided by the HA group. + diff --git a/collectors/likwid/groups/broadwellEP/MEM_DP.txt b/collectors/likwid/groups/broadwellEP/MEM_DP.txt new file mode 100644 index 0000000..6078d57 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/MEM_DP.txt @@ -0,0 +1,73 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +MFLOP/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time +AVX [MFLOP/s] 1.0E-06*(PMC2*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Operational intensity (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +AVX [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] 
= 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) +-- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per socket base. Also outputs total data volume transferred from main memory. +SSE scalar and packed double precision FLOP rates. Also reports on packed AVX +32b instructions. +The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. diff --git a/collectors/likwid/groups/broadwellEP/MEM_SP.txt b/collectors/likwid/groups/broadwellEP/MEM_SP.txt new file mode 100644 index 0000000..d18d2ab --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/MEM_SP.txt @@ -0,0 +1,73 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +MFLOP/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time +AVX [MFLOP/s] 1.0E-06*(PMC2*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Operational intensity (PMC0*4.0+PMC1+PMC2*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +AVX [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +Packed [MUOPS/s] = 
1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) +-- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per socket base. Also outputs total data volume transferred from main memory. +SSE scalar and packed single precision FLOP rates. Also reports on packed AVX +32b instructions. +The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. diff --git a/collectors/likwid/groups/broadwellEP/NUMA.txt b/collectors/likwid/groups/broadwellEP/NUMA.txt new file mode 100644 index 0000000..5b30e25 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/NUMA.txt @@ -0,0 +1,41 @@ +SHORT Local and remote data transfers + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +BBOX0C0 REQUESTS_READS_LOCAL +BBOX1C0 REQUESTS_READS_LOCAL +BBOX0C1 REQUESTS_READS_REMOTE +BBOX1C1 REQUESTS_READS_REMOTE +BBOX0C2 REQUESTS_WRITES_LOCAL +BBOX1C2 REQUESTS_WRITES_LOCAL +BBOX0C3 REQUESTS_WRITES_REMOTE +BBOX1C3 REQUESTS_WRITES_REMOTE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local bandwidth [MByte/s] 1.E-06*((BBOX0C0+BBOX1C0+BBOX0C2+BBOX1C2)*64)/time +Local data volume [GByte] 1.E-09*(BBOX0C0+BBOX1C0+BBOX0C2+BBOX1C2)*64 +Remote bandwidth [MByte/s] 1.E-06*((BBOX0C1+BBOX1C1+BBOX0C3+BBOX1C3)*64)/time +Remote data volume [GByte] 1.E-09*(BBOX0C1+BBOX1C1+BBOX0C3+BBOX1C3)*64 +Total bandwidth [MByte/s] 1.E-06*((BBOX0C0+BBOX1C0+BBOX0C2+BBOX1C2+BBOX0C1+BBOX1C1+BBOX0C3+BBOX1C3)*64)/time +Total data volume [GByte] 1.E-09*(BBOX0C0+BBOX1C0+BBOX0C2+BBOX1C2+BBOX0C1+BBOX1C1+BBOX0C3+BBOX1C3)*64 + + +LONG +Formulas: +CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY +Local bandwidth [MByte/s] = 1.E-06*((SUM(REQUESTS_READS_LOCAL)+SUM(REQUESTS_WRITES_LOCAL))*64)/time +Local data volume [GByte] = 1.E-09*(SUM(REQUESTS_READS_LOCAL)+SUM(REQUESTS_WRITES_LOCAL))*64 +Remote bandwidth [MByte/s] = 1.E-06*((SUM(REQUESTS_READS_REMOTE)+SUM(REQUESTS_WRITES_REMOTE))*64)/time +Remote data volume [GByte] = 1.E-09*(SUM(REQUESTS_READS_REMOTE)+SUM(REQUESTS_WRITES_REMOTE))*64 +Total bandwidth [MByte/s] = 1.E-06*((SUM(REQUESTS_READS_LOCAL)+SUM(REQUESTS_WRITES_LOCAL)+SUM(REQUESTS_READS_REMOTE)+SUM(REQUESTS_WRITES_REMOTE))*64)/time +Total data volume [GByte] = 1.E-09*(SUM(REQUESTS_READS_LOCAL)+SUM(REQUESTS_WRITES_LOCAL)+SUM(REQUESTS_READS_REMOTE)+SUM(REQUESTS_WRITES_REMOTE))*64 +-- +This performance group measures the data traffic of CPU sockets to local and remote +CPU sockets. It uses the Home Agent for calculation. 
This may also include data from +sources other than the memory controllers. diff --git a/collectors/likwid/groups/broadwellEP/PORT_USAGE.txt b/collectors/likwid/groups/broadwellEP/PORT_USAGE.txt new file mode 100644 index 0000000..298df1d --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/PORT_USAGE.txt @@ -0,0 +1,50 @@ +SHORT Execution port utilization + +REQUIRE_NOHT + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_PORT_PORT_0 +PMC1 UOPS_EXECUTED_PORT_PORT_1 +PMC2 UOPS_EXECUTED_PORT_PORT_2 +PMC3 UOPS_EXECUTED_PORT_PORT_3 +PMC4 UOPS_EXECUTED_PORT_PORT_4 +PMC5 UOPS_EXECUTED_PORT_PORT_5 +PMC6 UOPS_EXECUTED_PORT_PORT_6 +PMC7 UOPS_EXECUTED_PORT_PORT_7 + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Port0 usage ratio PMC0/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port1 usage ratio PMC1/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port2 usage ratio PMC2/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port3 usage ratio PMC3/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port4 usage ratio PMC4/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port5 usage ratio PMC5/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port6 usage ratio PMC6/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port7 usage ratio PMC7/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) + +LONG +Formulas: +Port0 usage ratio = UOPS_EXECUTED_PORT_PORT_0/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port1 usage ratio = UOPS_EXECUTED_PORT_PORT_1/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port2 usage ratio = UOPS_EXECUTED_PORT_PORT_2/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port3 usage ratio = UOPS_EXECUTED_PORT_PORT_3/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port4 usage ratio = UOPS_EXECUTED_PORT_PORT_4/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port5 usage ratio = UOPS_EXECUTED_PORT_PORT_5/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port6 usage ratio = UOPS_EXECUTED_PORT_PORT_6/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port7 usage ratio = UOPS_EXECUTED_PORT_PORT_7/SUM(UOPS_EXECUTED_PORT_PORT_*) +- +This group measures the execution port utilization in a CPU core. The group can +only be measured when HyperThreading is disabled because only then each CPU core +can program eight counters. +Please be aware that the counters PMC4-7 are broken on Intel Broadwell. They +don't increment if either user- or kernel-level filtering is applied. User-level +filtering is the default in LIKWID, hence kernel-level filtering is added +automatically for PMC4-7. As a consequence, the returned counts for PMC4-7 can be +much higher than expected. 
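+Illustration with invented counts: if UOPS_EXECUTED_PORT_PORT_0 = 2.0E08 and the +eight ports together executed 1.0E09 uOPs, the Port0 usage ratio is 0.2.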
diff --git a/collectors/likwid/groups/broadwellEP/QPI.txt b/collectors/likwid/groups/broadwellEP/QPI.txt new file mode 100644 index 0000000..8594706 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/QPI.txt @@ -0,0 +1,49 @@ +SHORT QPI Link Layer data + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +QBOX0C0 RXL_FLITS_G0_DATA +QBOX1C0 RXL_FLITS_G0_DATA +QBOX0C1 RXL_FLITS_G0_NON_DATA +QBOX1C1 RXL_FLITS_G0_NON_DATA +QBOX0C2 TXL_FLITS_G0_DATA +QBOX1C2 TXL_FLITS_G0_DATA +QBOX0C3 TXL_FLITS_G0_NON_DATA +QBOX1C3 TXL_FLITS_G0_NON_DATA + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +QPI send data volume [GByte] 1.E-09*(QBOX0C2+QBOX1C2)*8 +QPI send data bandwidth [MByte/s] 1.E-06*(QBOX0C2+QBOX1C2)*8/time +QPI send link volume [GByte] 1.E-09*(QBOX0C2+QBOX1C2+QBOX0C3+QBOX1C3)*8 +QPI send link bandwidth [MByte/s] 1.E-06*(QBOX0C2+QBOX1C2+QBOX0C3+QBOX1C3)*8/time +QPI receive data volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0)*8 +QPI receive data bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0)*8/time +QPI receive link volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0+QBOX0C1+QBOX1C1)*8 +QPI receive link bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0+QBOX0C1+QBOX1C1)*8/time +QPI total transfer volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0+QBOX0C2+QBOX1C2+QBOX0C1+QBOX1C1+QBOX0C3+QBOX1C3)*8 +QPI total bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0+QBOX0C2+QBOX1C2+QBOX0C1+QBOX1C1+QBOX0C3+QBOX1C3)*8/time + +LONG +Formulas: +QPI send data volume [GByte] = 1.E-09*(sum(TXL_FLITS_G0_DATA)*8) +QPI send data bandwidth [MByte/s] = 1.E-06*(sum(TXL_FLITS_G0_DATA)*8)/runtime +QPI send link volume [GByte] = 1.E-09*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8) +QPI send link bandwidth [MByte/s] = 1.E-06*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8)/runtime +QPI receive data volume [GByte] = 1.E-09*(sum(RXL_FLITS_G0_DATA)*8) +QPI receive data bandwidth [MByte/s] = 1.E-06*(sum(RXL_FLITS_G0_DATA)*8)/runtime +QPI receive link volume [GByte] = 1.E-09*((sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8) +QPI receive link bandwidth [MByte/s] = 1.E-06*((sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8)/runtime +QPI total transfer volume [GByte] = 1.E-09*(sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA)+sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8 +QPI total bandwidth [MByte/s] = 1.E-06*(sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA)+sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8/runtime +-- +The Intel QPI Link Layer is responsible for packetizing requests from the caching agent (CBOXes) +on the way out to the system interface. For Broadwell EP systems, the Link Layer and the +Ring interface are separated. The QPI link volume contains header, data and trailer while the +QPI data volume counts only the data flits.
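The QPI formulas above scale flit counts by 8 because each G0 flit carries eight bytes. A small Go sketch of the bandwidth computation, assuming the QBOX counter deltas and the measurement time are already available (names and sample values invented for illustration):

package main

import "fmt"

// qpiBandwidthMBs follows the QPI formulas 1.E-06*SUM(flits)*8/runtime,
// i.e. each G0 flit is counted as 8 bytes.
func qpiBandwidthMBs(flits []uint64, runtimeSec float64) float64 {
	var sum uint64
	for _, f := range flits {
		sum += f
	}
	return 1.0e-6 * float64(sum) * 8.0 / runtimeSec
}

func main() {
	// e.g. TXL_FLITS_G0_DATA deltas from QBOX0 and QBOX1 over one second
	fmt.Printf("%.2f MByte/s\n", qpiBandwidthMBs([]uint64{5_000_000, 4_800_000}, 1.0))
}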
diff --git a/collectors/likwid/groups/broadwellEP/TLB_DATA.txt b/collectors/likwid/groups/broadwellEP/TLB_DATA.txt new file mode 100644 index 0000000..54f5e05 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/TLB_DATA.txt @@ -0,0 +1,35 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK +PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK +PMC2 DTLB_LOAD_MISSES_WALK_DURATION +PMC3 DTLB_STORE_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB load misses PMC0 +L1 DTLB load miss rate PMC0/FIXC0 +L1 DTLB load miss duration PMC2 +L1 DTLB store misses PMC1 +L1 DTLB store miss rate PMC1/FIXC0 +L1 DTLB store miss duration PMC3 + +LONG +Formulas: +L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB load miss duration = DTLB_LOAD_MISSES_WALK_DURATION +L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK +L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB store miss duration = DTLB_STORE_MISSES_WALK_DURATION +- +The DTLB load and store miss rates give a measure of how often a TLB miss occurred +per instruction. The duration measures how many cycles a page table walk took. + diff --git a/collectors/likwid/groups/broadwellEP/TLB_INSTR.txt b/collectors/likwid/groups/broadwellEP/TLB_INSTR.txt new file mode 100644 index 0000000..647748f --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/TLB_INSTR.txt @@ -0,0 +1,28 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ITLB_MISSES_CAUSES_A_WALK +PMC1 ITLB_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration PMC1 + + +LONG +Formulas: +L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK +L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 ITLB miss duration = ITLB_MISSES_WALK_DURATION +- +The ITLB miss rate gives a measure of how often a TLB miss occurred +per instruction. The duration measures how many cycles a page table walk took.
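Both TLB groups combine a walk count with a walk duration in cycles. A Go sketch of the derived metrics, with invented names and sample values (not part of this patch):

package main

import "fmt"

// tlbMissStats derives the TLB metrics: walks per retired instruction
// and the average page walk duration in cycles.
func tlbMissStats(walks, walkCycles, instrRetired float64) (missRate, avgWalkCycles float64) {
	if instrRetired > 0 {
		missRate = walks / instrRetired
	}
	if walks > 0 {
		avgWalkCycles = walkCycles / walks
	}
	return missRate, avgWalkCycles
}

func main() {
	rate, dur := tlbMissStats(1200, 42000, 9_000_000)
	fmt.Printf("miss rate %.2e, avg walk %.1f cycles\n", rate, dur)
}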
+ diff --git a/collectors/likwid/groups/broadwellEP/TMA.txt b/collectors/likwid/groups/broadwellEP/TMA.txt new file mode 100644 index 0000000..afb4126 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/TMA.txt @@ -0,0 +1,48 @@ +SHORT Top down cycle allocation + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_RETIRED_RETIRE_SLOTS +PMC2 IDQ_UOPS_NOT_DELIVERED_CORE +PMC3 INT_MISC_RECOVERY_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +IPC FIXC0/FIXC1 +Total Slots 4*FIXC1 +Slots Retired PMC1 +Fetch Bubbles PMC2 +Recovery Bubbles 4*PMC3 +Front End [%] PMC2/(4*FIXC1)*100 +Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100 +Retiring [%] PMC1/(4*FIXC1)*100 +Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100 + +LONG +Formulas: +Total Slots = 4*CPU_CLK_UNHALTED_CORE +Slots Retired = UOPS_RETIRED_RETIRE_SLOTS +Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE +Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES +Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100 +Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100 +Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100 +Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100 +-- +This performance group measures cycles to determine percentage of time spent in +front end, back end, retiring and speculation. These metrics are published and +verified by Intel. Further information: +Webpage describing Top-Down Method and its usage in Intel vTune: +https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method +Paper by Yasin Ahmad: +https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0 +Slides by Yasin Ahmad: +http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf +The performance group was originally published here: +http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/ diff --git a/collectors/likwid/groups/broadwellEP/UOPS.txt b/collectors/likwid/groups/broadwellEP/UOPS.txt new file mode 100644 index 0000000..e6cc208 --- /dev/null +++ b/collectors/likwid/groups/broadwellEP/UOPS.txt @@ -0,0 +1,35 @@ +SHORT UOPs execution info + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_EXECUTED_THREAD +PMC2 UOPS_RETIRED_ALL +PMC3 UOPS_ISSUED_FLAGS_MERGE + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Issued UOPs PMC0 +Merged UOPs PMC3 +Executed UOPs PMC1 +Retired UOPs PMC2 + +LONG +Formulas: +Issued UOPs = UOPS_ISSUED_ANY +Merged UOPs = UOPS_ISSUED_FLAGS_MERGE +Executed UOPs = UOPS_EXECUTED_THREAD +Retired UOPs = UOPS_RETIRED_ALL +- +This group returns information about the instruction pipeline. It measures the +issued, executed and retired uOPs and returns the number of uOPs which were issued +but not executed as well as the number of uOPs which were executed but never retired. +The executed but not retired uOPs commonly come from speculatively executed branches. 
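The TMA group above divides the 4*CPU_CLK_UNHALTED_CORE issue slots among the four top-down categories. A Go sketch of that breakdown from raw counter deltas; the function name and sample values are invented for illustration:

package main

import "fmt"

// tma computes the top-down cycle allocation of the TMA group.
// Inputs are deltas of UOPS_ISSUED_ANY, UOPS_RETIRED_RETIRE_SLOTS,
// IDQ_UOPS_NOT_DELIVERED_CORE, INT_MISC_RECOVERY_CYCLES and CPU_CLK_UNHALTED_CORE.
func tma(issued, retired, fetchBubbles, recoveryCycles, coreCycles float64) (frontend, speculation, retiring, backend float64) {
	slots := 4 * coreCycles // four issue slots per core cycle
	frontend = fetchBubbles / slots * 100
	speculation = (issued - retired + 4*recoveryCycles) / slots * 100
	retiring = retired / slots * 100
	backend = (1 - (fetchBubbles+issued+4*recoveryCycles)/slots) * 100
	return frontend, speculation, retiring, backend
}

func main() {
	fe, sp, re, be := tma(8e9, 6e9, 2e9, 1e8, 4e9)
	fmt.Printf("FE %.1f%%  Spec %.1f%%  Ret %.1f%%  BE %.1f%%\n", fe, sp, re, be)
}

By construction the four percentages sum to 100, which is a quick sanity check when wiring up the counters.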
+ diff --git a/collectors/likwid/groups/core2/BRANCH.txt b/collectors/likwid/groups/core2/BRANCH.txt new file mode 100644 index 0000000..3c66c00 --- /dev/null +++ b/collectors/likwid/groups/core2/BRANCH.txt @@ -0,0 +1,30 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ANY +PMC1 BR_INST_RETIRED_MISPRED + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ANY/INSTR_RETIRED_ANY +Branch misprediction rate = BR_INST_RETIRED_MISPRED/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_INST_RETIRED_MISPRED/BR_INST_RETIRED_ANY +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ANY +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio states what +fraction of all branch instructions was mispredicted. +Instructions per branch is 1/branch rate. diff --git a/collectors/likwid/groups/core2/CACHE.txt b/collectors/likwid/groups/core2/CACHE.txt new file mode 100644 index 0000000..6eda059 --- /dev/null +++ b/collectors/likwid/groups/core2/CACHE.txt @@ -0,0 +1,34 @@ +SHORT Data cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPL +PMC1 L1D_ALL_REF + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +data cache misses PMC0 +data cache request rate PMC1/FIXC0 +data cache miss rate PMC0/FIXC0 +data cache miss ratio PMC0/PMC1 + +LONG +Formulas: +data cache request rate = L1D_ALL_REF / INSTR_RETIRED_ANY +data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY +data cache miss ratio = L1D_REPL / L1D_ALL_REF +- +This group measures the locality of your data accesses with regard to the +L1 cache. Data cache request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The data cache miss rate gives a measure how often it was necessary to get +cache lines from higher levels of the memory hierarchy. And finally +data cache miss ratio tells you how many of your memory references required +a cache line to be loaded from a higher level. While the data cache miss rate +might be given by your algorithm you should try to get data cache miss ratio +as low as possible by increasing your cache reuse. + diff --git a/collectors/likwid/groups/core2/CLOCK.txt b/collectors/likwid/groups/core2/CLOCK.txt new file mode 100644 index 0000000..871c4f9 --- /dev/null +++ b/collectors/likwid/groups/core2/CLOCK.txt @@ -0,0 +1,19 @@ +SHORT CPU clock information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 + +LONG +Formulas: +CPI = CPU_CLK_UNHALTED_CORE / INSTR_RETIRED_ANY +- +Most basic performance group measuring the clock frequency of the machine.
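The CLOCK group's two derived metrics can be computed as below; inverseClock is the reciprocal of the nominal frequency, here assumed to be 2.5 GHz for the sample values. A sketch with invented names, not code from this patch:

package main

import "fmt"

// clockAndCPI derives the CLOCK group metrics from the fixed counters:
// Clock [MHz] = 1.E-06*(FIXC1/FIXC2)/inverseClock and CPI = FIXC1/FIXC0.
func clockAndCPI(instr, coreCyc, refCyc, inverseClock float64) (clockMHz, cpi float64) {
	clockMHz = 1e-6 * (coreCyc / refCyc) / inverseClock
	cpi = coreCyc / instr
	return clockMHz, cpi
}

func main() {
	// assumed nominal frequency of 2.5 GHz -> inverseClock = 1/2.5e9
	mhz, cpi := clockAndCPI(1e9, 2e9, 2e9, 1/2.5e9)
	fmt.Printf("%.0f MHz, CPI %.2f\n", mhz, cpi)
}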
+ diff --git a/collectors/likwid/groups/core2/DATA.txt b/collectors/likwid/groups/core2/DATA.txt new file mode 100644 index 0000000..0f5bca5 --- /dev/null +++ b/collectors/likwid/groups/core2/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INST_RETIRED_LOADS +PMC1 INST_RETIRED_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = INST_RETIRED_LOADS/INST_RETIRED_STORES +- +This is a simple metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/core2/DIVIDE.txt b/collectors/likwid/groups/core2/DIVIDE.txt new file mode 100644 index 0000000..0753b4e --- /dev/null +++ b/collectors/likwid/groups/core2/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLES_DIV_BUSY +PMC1 DIV + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC1 +Avg. divide unit usage duration PMC0/PMC1 + +LONG +Formulas: +Number of divide ops = DIV +Avg. divide unit usage duration = CYCLES_DIV_BUSY/DIV +- +This performance group measures the average latency of divide operations diff --git a/collectors/likwid/groups/core2/FLOPS_DP.txt b/collectors/likwid/groups/core2/FLOPS_DP.txt new file mode 100644 index 0000000..e1698ff --- /dev/null +++ b/collectors/likwid/groups/core2/FLOPS_DP.txt @@ -0,0 +1,29 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 SIMD_COMP_INST_RETIRED_PACKED_DOUBLE +PMC1 SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*PMC0/PMC1 + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_DOUBLE*2+SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE)/time +Packed [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_PACKED_DOUBLE/runtime +Scalar [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*SIMD_COMP_INST_RETIRED_PACKED_DOUBLE/SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE +- +Profiling group to measure double SSE FLOPs. Don't forget that your code might also execute X87 FLOPs. +On the number of SIMD_COMP_INST_RETIRED_PACKED_DOUBLE you can see how well your code was vectorized. 
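The FLOPS_DP formula weights each packed SSE uop with two double-precision operations and each scalar uop with one. A one-line Go sketch with invented names and sample values:

package main

import "fmt"

// dpMflops follows the FLOPS_DP formula: each packed SSE uop performs
// two double-precision operations, each scalar uop one.
func dpMflops(packed, scalar, seconds float64) float64 {
	return 1e-6 * (packed*2 + scalar) / seconds
}

func main() {
	fmt.Printf("%.1f MFLOP/s\n", dpMflops(3e9, 1e9, 2.0))
}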
+ + diff --git a/collectors/likwid/groups/core2/FLOPS_SP.txt b/collectors/likwid/groups/core2/FLOPS_SP.txt new file mode 100644 index 0000000..a2c842c --- /dev/null +++ b/collectors/likwid/groups/core2/FLOPS_SP.txt @@ -0,0 +1,29 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 SIMD_COMP_INST_RETIRED_PACKED_SINGLE +PMC1 SIMD_COMP_INST_RETIRED_SCALAR_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*PMC0/PMC1 + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(SIMD_COMP_INST_RETIRED_PACKED_SINGLE*4+SIMD_COMP_INST_RETIRED_SCALAR_SINGLE)/time +Packed [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_PACKED_SINGLE/runtime +Scalar [MUOPS/s] = 1.0E-06*SIMD_COMP_INST_RETIRED_SCALAR_SINGLE/runtime +Vectorization ratio [%] = 100*SIMD_COMP_INST_RETIRED_PACKED_SINGLE/SIMD_COMP_INST_RETIRED_SCALAR_SINGLE +- +Profiling group to measure single precision SSE FLOPs. Don't forget that your code might also execute X87 FLOPs. +On the number of SIMD_COMP_INST_RETIRED_PACKED_SINGLE you can see how well your code was vectorized. + + diff --git a/collectors/likwid/groups/core2/FLOPS_X87.txt b/collectors/likwid/groups/core2/FLOPS_X87.txt new file mode 100644 index 0000000..46309e4 --- /dev/null +++ b/collectors/likwid/groups/core2/FLOPS_X87.txt @@ -0,0 +1,21 @@ +SHORT X87 MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 X87_OPS_RETIRED_ANY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +X87 [MFLOP/s] 1.0E-06*PMC0/time + +LONG +Formulas: +X87 [MFLOP/s] = 1.0E-06*X87_OPS_RETIRED_ANY/time +- +Profiling group to measure X87 FLOPs. Note that also non computational operations +are measured by this event. + diff --git a/collectors/likwid/groups/core2/L2.txt b/collectors/likwid/groups/core2/L2.txt new file mode 100644 index 0000000..d8cbe0d --- /dev/null +++ b/collectors/likwid/groups/core2/L2.txt @@ -0,0 +1,35 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPL +PMC1 L1D_M_EVICT + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT)*64/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT)*64.0 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is +computed by the number of cache line allocated in the L1 and the +number of modified cache lines evicted from the L1. +Note that this bandwidth also includes data transfers due to a +write allocate load on a store miss in L1. 
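All of the bandwidth groups follow the same pattern: a cache line count times 64 bytes, scaled over the measurement time. A Go sketch for the L2 case above (invented names, sample values):

package main

import "fmt"

const cacheLineBytes = 64.0

// l2Bandwidth mirrors the L2 group: bandwidth is cache lines moved
// (L1D_REPL loads plus L1D_M_EVICT writebacks) times the line size.
func l2Bandwidth(loads, evicts, seconds float64) (mbytesPerS, gbytes float64) {
	bytes := (loads + evicts) * cacheLineBytes
	return 1e-6 * bytes / seconds, 1e-9 * bytes
}

func main() {
	bw, vol := l2Bandwidth(1e8, 4e7, 1.5)
	fmt.Printf("%.1f MBytes/s, %.3f GBytes\n", bw, vol)
}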
+ diff --git a/collectors/likwid/groups/core2/L2CACHE.txt b/collectors/likwid/groups/core2/L2CACHE.txt new file mode 100644 index 0000000..d3b8776 --- /dev/null +++ b/collectors/likwid/groups/core2/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_RQSTS_THIS_CORE_ALL_MESI +PMC1 L2_RQSTS_SELF_I_STATE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_RQSTS_THIS_CORE_ALL_MESI / INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_SELF_I_STATE / INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_SELF_I_STATE / L2_RQSTS_THIS_CORE_ALL_MESI +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure how often it was necessary to get +cache lines from memory. And finally L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/core2/MEM.txt b/collectors/likwid/groups/core2/MEM.txt new file mode 100644 index 0000000..f6522ba --- /dev/null +++ b/collectors/likwid/groups/core2/MEM.txt @@ -0,0 +1,23 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BUS_TRANS_MEM_THIS_CORE_THIS_A +PMC1 BUS_TRANS_WB_THIS_CORE_ALL_A + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +Memory bandwidth [MBytes/s] = 1.0E-06*(BUS_TRANS_MEM_THIS_CORE_THIS_A+BUS_TRANS_WB_THIS_CORE_ALL_A)*64.0/time +Memory data volume [GBytes] = 1.0E-09*(BUS_TRANS_MEM_THIS_CORE_THIS_A+BUS_TRANS_WB_THIS_CORE_ALL_A)*64.0 +- +Profiling group to measure memory bandwidth drawn by this core. diff --git a/collectors/likwid/groups/core2/TLB.txt b/collectors/likwid/groups/core2/TLB.txt new file mode 100644 index 0000000..a46cc4b --- /dev/null +++ b/collectors/likwid/groups/core2/TLB.txt @@ -0,0 +1,29 @@ +SHORT TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_MISSES_ANY +PMC1 L1D_ALL_REF + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +CPI FIXC1/FIXC0 +L1 DTLB request rate PMC1/FIXC0 +DTLB miss rate PMC0/FIXC0 +L1 DTLB miss ratio PMC0/PMC1 + +LONG
Formulas: +L1 DTLB request rate = L1D_ALL_REF / INSTR_RETIRED_ANY +DTLB miss rate = DTLB_MISSES_ANY / INSTR_RETIRED_ANY +L1 DTLB miss ratio = DTLB_MISSES_ANY / L1D_ALL_REF +- +L1 DTLB request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The DTLB miss rate gives a measure how often a TLB miss occurred +per instruction. And finally L1 DTLB miss ratio tells you how many +of your memory references caused a TLB miss on average.
+ diff --git a/collectors/likwid/groups/core2/UOPS.txt b/collectors/likwid/groups/core2/UOPS.txt new file mode 100644 index 0000000..5d816d8 --- /dev/null +++ b/collectors/likwid/groups/core2/UOPS.txt @@ -0,0 +1,26 @@ +SHORT UOPs execution info + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 RS_UOPS_DISPATCHED_ALL +PMC1 UOPS_RETIRED_ANY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Executed UOPs PMC0 +Retired UOPs PMC1 + +LONG +Formulas: +Executed UOPs = RS_UOPS_DISPATCHED_ALL +Retired UOPs = UOPS_RETIRED_ANY +- +Performance group measures the executed and retired micro ops. The difference +between executed and retired uOPs are the speculatively executed uOPs. diff --git a/collectors/likwid/groups/core2/UOPS_RETIRE.txt b/collectors/likwid/groups/core2/UOPS_RETIRE.txt new file mode 100644 index 0000000..be0bf73 --- /dev/null +++ b/collectors/likwid/groups/core2/UOPS_RETIRE.txt @@ -0,0 +1,25 @@ +SHORT UOPs retirement + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_RETIRED_USED_CYCLES +PMC1 UOPS_RETIRED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio PMC0/FIXC1 +Unused cycles ratio PMC1/FIXC1 + + +LONG +Formulas: +Used cycles ratio = UOPS_RETIRED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio = UOPS_RETIRED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +- +This performance group returns the ratios of used and unused CPU cycles. Here +unused cycles are cycles where no operation is performed due to some stall. diff --git a/collectors/likwid/groups/goldmont/BRANCH.txt b/collectors/likwid/groups/goldmont/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/goldmont/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio sets directly +into relation what ratio of all branch instruction where mispredicted. +Instructions per branch is 1/branch rate. 
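A Go sketch of the four BRANCH metrics, applicable to the core2, goldmont and haswell variants of the group alike (function name and sample values invented for illustration):

package main

import "fmt"

// branchStats derives the BRANCH group metrics from retired branch,
// mispredicted branch and instruction counts.
func branchStats(branches, mispredicted, instructions float64) (rate, missRate, missRatio, instrPerBranch float64) {
	rate = branches / instructions
	missRate = mispredicted / instructions
	missRatio = mispredicted / branches
	instrPerBranch = instructions / branches
	return rate, missRate, missRatio, instrPerBranch
}

func main() {
	r, mr, mratio, ipb := branchStats(2e8, 4e6, 1e9)
	fmt.Printf("rate %.2f, miss rate %.4f, miss ratio %.2f, instr/branch %.1f\n", r, mr, mratio, ipb)
}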
+ diff --git a/collectors/likwid/groups/goldmont/CLOCK.txt b/collectors/likwid/groups/goldmont/CLOCK.txt new file mode 100644 index 0000000..b2174c8 --- /dev/null +++ b/collectors/likwid/groups/goldmont/CLOCK.txt @@ -0,0 +1,23 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +- +Goldmont implements the new RAPL interface. This interface enables monitoring +of the consumed energy on the package (socket) level. + diff --git a/collectors/likwid/groups/goldmont/DATA.txt b/collectors/likwid/groups/goldmont/DATA.txt new file mode 100644 index 0000000..61a915b --- /dev/null +++ b/collectors/likwid/groups/goldmont/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_UOPS_RETIRED_ALL_LOADS +PMC1 MEM_UOPS_RETIRED_ALL_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_UOPS_RETIRED_ALL_LOADS/MEM_UOPS_RETIRED_ALL_STORES +- +This is a metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/goldmont/DIVIDE.txt b/collectors/likwid/groups/goldmont/DIVIDE.txt new file mode 100644 index 0000000..9fc6702 --- /dev/null +++ b/collectors/likwid/groups/goldmont/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLES_DIV_BUSY_ALL +PMC1 CYCLES_DIV_BUSY_ALL_COUNT + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC1 +Avg. divide unit usage duration PMC0/PMC1 + +LONG +Formulas: +Number of divide ops = CYCLES_DIV_BUSY_ALL_COUNT +Avg. divide unit usage duration = CYCLES_DIV_BUSY_ALL/CYCLES_DIV_BUSY_ALL_COUNT +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/goldmont/ENERGY.txt b/collectors/likwid/groups/goldmont/ENERGY.txt new file mode 100644 index 0000000..7770534 --- /dev/null +++ b/collectors/likwid/groups/goldmont/ENERGY.txt @@ -0,0 +1,33 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR3 PWR_DRAM_ENERGY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +Goldmont implements the new RAPL interface. This interface enables monitoring +of the consumed energy on the package (socket) and DRAM level.
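RAPL energy counters report Joules, so average power is simply the energy delta over the measurement interval. A minimal Go sketch (sample values assumed):

package main

import "fmt"

// avgPowerW averages a RAPL energy delta (Joules) over the measurement
// interval, as in Power [W] = PWR_PKG_ENERGY/time.
func avgPowerW(joules, seconds float64) float64 {
	return joules / seconds
}

func main() {
	fmt.Printf("package: %.1f W, DRAM: %.1f W\n", avgPowerW(45.0, 1.5), avgPowerW(12.0, 1.5))
}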
+ diff --git a/collectors/likwid/groups/goldmont/ICACHE.txt b/collectors/likwid/groups/goldmont/ICACHE.txt new file mode 100644 index 0000000..5f11ad6 --- /dev/null +++ b/collectors/likwid/groups/goldmont/ICACHE.txt @@ -0,0 +1,25 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ICACHE_ACCESSES +PMC1 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 + +LONG +Formulas: +L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY +L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES +- +This group measures some L1 instruction cache metrics. diff --git a/collectors/likwid/groups/goldmont/L2CACHE.txt b/collectors/likwid/groups/goldmont/L2CACHE.txt new file mode 100644 index 0000000..32a1545 --- /dev/null +++ b/collectors/likwid/groups/goldmont/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 LONGEST_LAT_CACHE_REFERENCE +PMC1 LONGEST_LAT_CACHE_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = LONGEST_LAT_CACHE_REFERENCE/INSTR_RETIRED_ANY +L2 miss rate = LONGEST_LAT_CACHE_MISS/INSTR_RETIRED_ANY +L2 miss ratio = LONGEST_LAT_CACHE_MISS/LONGEST_LAT_CACHE_REFERENCE +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure how often it was necessary to get +cache lines from memory. And finally L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get data cache miss ratio as low as possible by increasing your cache +reuse. + diff --git a/collectors/likwid/groups/goldmont/TLB_DATA.txt b/collectors/likwid/groups/goldmont/TLB_DATA.txt new file mode 100644 index 0000000..b4679e5 --- /dev/null +++ b/collectors/likwid/groups/goldmont/TLB_DATA.txt @@ -0,0 +1,27 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 PAGE_WALKS_D_SIDE_COUNT +PMC1 PAGE_WALKS_D_SIDE_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB misses PMC0 +L1 DTLB miss rate PMC0/FIXC0 +L1 DTLB miss duration [Cyc] PMC1/PMC0 + +LONG +Formulas: +L1 DTLB misses = PAGE_WALKS_D_SIDE_COUNT +L1 DTLB miss rate = PAGE_WALKS_D_SIDE_COUNT / INSTR_RETIRED_ANY +L1 DTLB miss duration [Cyc] = PAGE_WALKS_D_SIDE_CYCLES / PAGE_WALKS_D_SIDE_COUNT +- +The DTLB load and store miss rates gives a measure how often a TLB miss occurred +per instruction. The duration measures the time in cycles how long a walk did take. 
+ diff --git a/collectors/likwid/groups/goldmont/TLB_INSTR.txt b/collectors/likwid/groups/goldmont/TLB_INSTR.txt new file mode 100644 index 0000000..30dce1e --- /dev/null +++ b/collectors/likwid/groups/goldmont/TLB_INSTR.txt @@ -0,0 +1,27 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 PAGE_WALKS_I_SIDE_COUNT +PMC1 PAGE_WALKS_I_SIDE_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + + +LONG +Formulas: +L1 ITLB misses = PAGE_WALKS_I_SIDE_COUNT +L1 ITLB miss rate = PAGE_WALKS_I_SIDE_COUNT / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = PAGE_WALKS_I_SIDE_CYCLES / PAGE_WALKS_I_SIDE_COUNT +- +The ITLB miss rates gives a measure how often a TLB miss occurred +per instruction. The duration measures the time in cycles how long a walk did take. diff --git a/collectors/likwid/groups/haswell/BRANCH.txt b/collectors/likwid/groups/haswell/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/haswell/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio sets directly +into relation what ratio of all branch instruction where mispredicted. +Instructions per branch is 1/branch rate. 
+ diff --git a/collectors/likwid/groups/haswell/CACHES.txt b/collectors/likwid/groups/haswell/CACHES.txt new file mode 100644 index 0000000..e39e861 --- /dev/null +++ b/collectors/likwid/groups/haswell/CACHES.txt @@ -0,0 +1,71 @@ +SHORT Cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L1D_M_EVICT +PMC2 L2_LINES_IN_ALL +PMC3 L2_TRANS_L2_WB +CBOX0C0 CACHE_LOOKUP_READ_MESI +CBOX1C0 CACHE_LOOKUP_READ_MESI +CBOX2C0 CACHE_LOOKUP_READ_MESI +CBOX3C0 CACHE_LOOKUP_READ_MESI +CBOX0C1 CACHE_LOOKUP_WRITE_MESI +CBOX1C1 CACHE_LOOKUP_WRITE_MESI +CBOX2C1 CACHE_LOOKUP_WRITE_MESI +CBOX3C1 CACHE_LOOKUP_WRITE_MESI + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 +L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time +L3 to L2 load data volume [GBytes] 1.0E-09*PMC2*64.0 +L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time +L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 +System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0)*64.0/time +System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0)*64.0 +L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1)*64/time +L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1)*64 +L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1)*64.0/time +L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1)*64.0 + +LONG +Formulas: +L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time +L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64 +L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time +L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64 +L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time +L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64 +L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time +L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64 +L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time +L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64 +L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(CACHE_LOOKUP_READ_MESI))*64/time +System to L3 data volume [GBytes] = 1.0E-09*(SUM(CACHE_LOOKUP_READ_MESI))*64 +L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(CACHE_LOOKUP_WRITE_MESI))*64/time +L3 to system data volume [GBytes] = 1.0E-09*(SUM(CACHE_LOOKUP_WRITE_MESI))*64 +L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(CACHE_LOOKUP_READ_MESI)+SUM(CACHE_LOOKUP_WRITE_MESI))*64/time +L3 to/from system data volume [GBytes] = 
1.0E-09*(SUM(CACHE_LOOKUP_READ_MESI)+SUM(CACHE_LOOKUP_WRITE_MESI))*64 +- +Group to measure cache transfers between L1 and memory. Please note that the +L3 to/from system metrics contain any traffic to the system (memory, +Intel QPI, etc.) but do not seem to capture all of it: the memory read +bandwidth and the L3 to L2 bandwidth are commonly higher than the system to L3 +bandwidth. + diff --git a/collectors/likwid/groups/haswell/CLOCK.txt b/collectors/likwid/groups/haswell/CLOCK.txt new file mode 100644 index 0000000..8055d5b --- /dev/null +++ b/collectors/likwid/groups/haswell/CLOCK.txt @@ -0,0 +1,26 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +UBOXFIX UNCORE_CLOCK + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*UBOXFIX/time +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time +- +Haswell implements the new RAPL interface. This interface enables monitoring +of the consumed energy on the package (socket) level. + diff --git a/collectors/likwid/groups/haswell/CYCLE_ACTIVITY.txt b/collectors/likwid/groups/haswell/CYCLE_ACTIVITY.txt new file mode 100644 index 0000000..c432a44 --- /dev/null +++ b/collectors/likwid/groups/haswell/CYCLE_ACTIVITY.txt @@ -0,0 +1,38 @@ +SHORT Cycle Activities + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING +PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING +PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING +PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Cycles without execution [%] (PMC3/FIXC1)*100 +Cycles without execution due to L1D [%] (PMC2/FIXC1)*100 +Cycles without execution due to L2 [%] (PMC0/FIXC1)*100 +Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100 +-- +This performance group measures the cycles while waiting for data from the cache +and memory hierarchy. +CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts number of cycles nothing is executed on +any execution port. +CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while memory subsystem has an +outstanding load.
diff --git a/collectors/likwid/groups/haswell/CYCLE_STALLS.txt b/collectors/likwid/groups/haswell/CYCLE_STALLS.txt new file mode 100644 index 0000000..795aeb9 --- /dev/null +++ b/collectors/likwid/groups/haswell/CYCLE_STALLS.txt @@ -0,0 +1,45 @@ +SHORT Cycle Activities (Stalls) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING +PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING +PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING +PMC3 CYCLE_ACTIVITY_STALLS_TOTAL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Total execution stalls PMC3 +Stalls caused by L1D misses [%] (PMC2/PMC3)*100 +Stalls caused by L2 misses [%] (PMC0/PMC3)*100 +Stalls caused by memory loads [%] (PMC1/PMC3)*100 +Execution stall rate [%] (PMC3/FIXC1)*100 +Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100 +Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100 +Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL +Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100 +-- +This performance group measures the stalls caused by data traffic in the cache +hierarchy. +CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls. +CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while memory subsystem has +an outstanding load. diff --git a/collectors/likwid/groups/haswell/DATA.txt b/collectors/likwid/groups/haswell/DATA.txt new file mode 100644 index 0000000..17948d4 --- /dev/null +++ b/collectors/likwid/groups/haswell/DATA.txt @@ -0,0 +1,27 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_UOPS_RETIRED_LOADS +PMC1 MEM_UOPS_RETIRED_STORES +PMC2 UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 +Load ratio PMC0/PMC2 +Store ratio PMC1/PMC2 + +LONG +Formulas: +Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES +Load ratio = MEM_UOPS_RETIRED_LOADS/UOPS_RETIRED_ALL +Store ratio = MEM_UOPS_RETIRED_STORES/UOPS_RETIRED_ALL +- +This is a metric to determine your load to store ratio. 
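The CYCLE_STALLS group above reports each stall cause twice: relative to all execution stalls and relative to all core cycles. A Go sketch of both views (invented names, sample values):

package main

import "fmt"

// stallBreakdown expresses the CYCLE_STALLS metrics: a stall cause as a
// share of all execution stalls and as a share of all core cycles.
func stallBreakdown(causeStalls, totalStalls, coreCycles float64) (ofStalls, ofCycles float64) {
	return causeStalls / totalStalls * 100, causeStalls / coreCycles * 100
}

func main() {
	s, c := stallBreakdown(3e8, 5e8, 2e9)
	fmt.Printf("%.1f%% of stalls, %.1f%% of cycles\n", s, c)
}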
+ diff --git a/collectors/likwid/groups/haswell/DIVIDE.txt b/collectors/likwid/groups/haswell/DIVIDE.txt new file mode 100644 index 0000000..c9690cf --- /dev/null +++ b/collectors/likwid/groups/haswell/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ARITH_DIVIDER_UOPS +PMC1 ARITH_DIVIDER_CYCLES + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_DIVIDER_UOPS +Avg. divide unit usage duration = ARITH_DIVIDER_CYCLES/ARITH_DIVIDER_UOPS +- +This performance group measures the average latency of divide operations diff --git a/collectors/likwid/groups/haswell/ENERGY.txt b/collectors/likwid/groups/haswell/ENERGY.txt new file mode 100644 index 0000000..59242db --- /dev/null +++ b/collectors/likwid/groups/haswell/ENERGY.txt @@ -0,0 +1,39 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR2 PWR_PP1_ENERGY +PWR3 PWR_DRAM_ENERGY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy PP1 [J] PWR2 +Power PP1 [W] PWR2/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power PP1 = PWR_PP1_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +Haswell implements the new RAPL interface. This interface enables to +monitor the consumed energy on the package (socket) and DRAM level. + diff --git a/collectors/likwid/groups/haswell/FALSE_SHARE.txt b/collectors/likwid/groups/haswell/FALSE_SHARE.txt new file mode 100644 index 0000000..db438a3 --- /dev/null +++ b/collectors/likwid/groups/haswell/FALSE_SHARE.txt @@ -0,0 +1,27 @@ +SHORT False sharing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM +PMC2 MEM_LOAD_UOPS_RETIRED_ALL_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local LLC hit with false sharing [MByte] 1.E-06*PMC0*64 +Local LLC hit with false sharing rate PMC0/PMC2 + +LONG +Formulas: +Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM*64 +Local LLC false sharing rate = MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL +- +False-sharing of cache lines can dramatically reduce the performance of an +application. This performance group measures the L3 traffic induced by false-sharing. +The false-sharing rate uses all memory loads as reference. +Please keep in mind that the MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM event may +undercount by as much as 40% (Errata HSD25). 
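A Go sketch of the two FALSE_SHARE metrics, converting HITM hits into an approximate traffic volume and a per-load rate (invented names, sample values; remember the undercount erratum noted above):

package main

import "fmt"

// falseSharing converts MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM counts into
// the group's metrics: traffic volume in MByte and the rate per load.
func falseSharing(hitm, allLoads float64) (mbyte, rate float64) {
	return 1e-6 * hitm * 64, hitm / allLoads
}

func main() {
	v, r := falseSharing(2e6, 5e9)
	fmt.Printf("%.1f MByte, rate %.2e\n", v, r)
}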
diff --git a/collectors/likwid/groups/haswell/FLOPS_AVX.txt b/collectors/likwid/groups/haswell/FLOPS_AVX.txt new file mode 100644 index 0000000..15aacb8 --- /dev/null +++ b/collectors/likwid/groups/haswell/FLOPS_AVX.txt @@ -0,0 +1,28 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 AVX_INSTS_CALC + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC0*4.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(AVX_INSTS_CALC*8)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(AVX_INSTS_CALC*4)/runtime +- +Packed 32b AVX FLOP/s rates. Approximate counts of AVX & AVX2 256-bit instructions. +May count non-AVX instructions that employ 256-bit operations, including (but +not necessarily limited to) rep string instructions that use 256-bit loads and +stores for optimized performance, XSAVE* and XRSTOR*, and operations that +transition the x87 FPU data registers between x87 and MMX. +Caution: The event AVX_INSTS_CALC counts the insertf128 instruction often used +by the Intel C compilers for (unaligned) vector loads. diff --git a/collectors/likwid/groups/haswell/ICACHE.txt b/collectors/likwid/groups/haswell/ICACHE.txt new file mode 100644 index 0000000..f1e2335 --- /dev/null +++ b/collectors/likwid/groups/haswell/ICACHE.txt @@ -0,0 +1,33 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ICACHE_ACCESSES +PMC1 ICACHE_MISSES +PMC2 ICACHE_IFETCH_STALL +PMC3 ILD_STALL_IQ_FULL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 +L1I stalls PMC2 +L1I stall rate PMC2/FIXC0 +L1I queue full stalls PMC3 +L1I queue full stall rate PMC3/FIXC0 + +LONG +Formulas: +L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY +L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES +L1I stalls = ICACHE_IFETCH_STALL +L1I stall rate = ICACHE_IFETCH_STALL / INSTR_RETIRED_ANY +- +This group measures some L1 instruction cache metrics. 
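The FLOPS_AVX factors of 8 and 4 in the group above come from the width of a 256-bit register: eight 32-bit or four 64-bit floating-point values. A Go sketch (invented names; note the group's own caveat that AVX_INSTS_CALC is only approximate):

package main

import "fmt"

// avxMflops shows why the FLOPS_AVX factors are 8 and 4: a 256-bit AVX
// register holds eight 32-bit or four 64-bit floating-point values, so
// each AVX_INSTS_CALC event is assumed to perform that many operations.
func avxMflops(avxInsts, seconds float64) (sp, dp float64) {
	return 1e-6 * avxInsts * 8 / seconds, 1e-6 * avxInsts * 4 / seconds
}

func main() {
	sp, dp := avxMflops(1e9, 1.0)
	fmt.Printf("SP %.0f, DP %.0f MFLOP/s\n", sp, dp)
}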
diff --git a/collectors/likwid/groups/haswell/L2.txt b/collectors/likwid/groups/haswell/L2.txt new file mode 100644 index 0000000..60c7f79 --- /dev/null +++ b/collectors/likwid/groups/haswell/L2.txt @@ -0,0 +1,37 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L2_TRANS_L1D_WB +PMC2 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the +number of cache lines loaded from the L2 into the L1 data cache and the writebacks from +the L1 data cache to the L2 cache. The group also outputs the total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and cache lines transferred into the instruction +cache. diff --git a/collectors/likwid/groups/haswell/L2CACHE.txt b/collectors/likwid/groups/haswell/L2CACHE.txt new file mode 100644 index 0000000..9b5dd4b --- /dev/null +++ b/collectors/likwid/groups/haswell/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_TRANS_ALL_REQUESTS +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure how often it was necessary to get +cache lines from memory. And finally L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get data cache miss ratio as low as possible by increasing your cache reuse.
+ + diff --git a/collectors/likwid/groups/haswell/L3.txt b/collectors/likwid/groups/haswell/L3.txt new file mode 100644 index 0000000..f63a918 --- /dev/null +++ b/collectors/likwid/groups/haswell/L3.txt @@ -0,0 +1,36 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ALL +PMC1 L2_TRANS_L2_WB + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed from the +number of cache lines allocated in the L2 and the number of modified cache lines +evicted from the L2. This group also outputs the data volume transferred between the +L3 and the measured cores' L2 caches. Note that this bandwidth also includes data +transfers due to a write allocate load on a store miss in L2. + diff --git a/collectors/likwid/groups/haswell/L3CACHE.txt b/collectors/likwid/groups/haswell/L3CACHE.txt new file mode 100644 index 0000000..f863daa --- /dev/null +++ b/collectors/likwid/groups/haswell/L3CACHE.txt @@ -0,0 +1,35 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_RETIRED_L3_ALL +PMC1 MEM_LOAD_UOPS_RETIRED_L3_MISS +PMC2 UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate PMC0/PMC2 +L3 miss rate PMC1/PMC2 +L3 miss ratio PMC1/PMC0 + +LONG +Formulas: +L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL +L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL +L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL +- +This group measures the locality of your data accesses with regard to the +L3 cache. L3 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L3 miss rate gives a measure how often it was necessary to get +cache lines from memory. And finally L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get data cache miss ratio as low as possible by increasing your cache reuse.
+ + diff --git a/collectors/likwid/groups/haswell/MEM.txt b/collectors/likwid/groups/haswell/MEM.txt new file mode 100644 index 0000000..3a12df7 --- /dev/null +++ b/collectors/likwid/groups/haswell/MEM.txt @@ -0,0 +1,36 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +MBOX0C1 DRAM_READS +MBOX0C2 DRAM_WRITES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory load bandwidth [MBytes/s] 1.0E-06*MBOX0C1*64.0/time +Memory load data volume [GBytes] 1.0E-09*MBOX0C1*64.0 +Memory evict bandwidth [MBytes/s] 1.0E-06*MBOX0C2*64.0/time +Memory evict data volume [GBytes] 1.0E-09*MBOX0C2*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX0C2)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX0C2)*64.0 + +LONG +Formulas: +Memory load bandwidth [MBytes/s] = 1.0E-06*DRAM_READS*64.0/time +Memory load data volume [GBytes] = 1.0E-09*DRAM_READS*64.0 +Memory evict bandwidth [MBytes/s] = 1.0E-06*DRAM_WRITES*64.0/time +Memory evict data volume [GBytes] = 1.0E-09*DRAM_WRITES*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(DRAM_READS+DRAM_WRITES)*64.0/time +Memory data volume [GBytes] = 1.0E-09*(DRAM_READS+DRAM_WRITES)*64.0 +- +Profiling group to measure main memory bandwidth and data volume. The bandwidth +is computed from the number of DRAM reads and writes multiplied by the cache line +size of 64 bytes. As the counts are taken at the memory controller, the traffic of +all cores of the socket is included. + diff --git a/collectors/likwid/groups/haswell/PORT_USAGE.txt b/collectors/likwid/groups/haswell/PORT_USAGE.txt new file mode 100644 index 0000000..eb74ffe --- /dev/null +++ b/collectors/likwid/groups/haswell/PORT_USAGE.txt @@ -0,0 +1,46 @@ +SHORT Execution port utilization + +REQUIRE_NOHT + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_PORT_PORT_0 +PMC1 UOPS_EXECUTED_PORT_PORT_1 +PMC2 UOPS_EXECUTED_PORT_PORT_2 +PMC3 UOPS_EXECUTED_PORT_PORT_3 +PMC4 UOPS_EXECUTED_PORT_PORT_4 +PMC5 UOPS_EXECUTED_PORT_PORT_5 +PMC6 UOPS_EXECUTED_PORT_PORT_6 +PMC7 UOPS_EXECUTED_PORT_PORT_7 + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Port0 usage ratio PMC0/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port1 usage ratio PMC1/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port2 usage ratio PMC2/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port3 usage ratio PMC3/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port4 usage ratio PMC4/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port5 usage ratio PMC5/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port6 usage ratio PMC6/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port7 usage ratio PMC7/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) + +LONG +Formulas: +Port0 usage ratio = UOPS_EXECUTED_PORT_PORT_0/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port1 usage ratio = UOPS_EXECUTED_PORT_PORT_1/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port2 usage ratio = UOPS_EXECUTED_PORT_PORT_2/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port3 usage ratio = UOPS_EXECUTED_PORT_PORT_3/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port4 usage ratio = UOPS_EXECUTED_PORT_PORT_4/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port5 usage ratio = UOPS_EXECUTED_PORT_PORT_5/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port6 usage ratio =
UOPS_EXECUTED_PORT_PORT_6/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port7 usage ratio = UOPS_EXECUTED_PORT_PORT_7/SUM(UOPS_EXECUTED_PORT_PORT_*) +- +This group measures the execution port utilization in a CPU core. The group can +only be measured when HyperThreading is disabled because only then each CPU core +can program eight counters. diff --git a/collectors/likwid/groups/haswell/RECOVERY.txt b/collectors/likwid/groups/haswell/RECOVERY.txt new file mode 100644 index 0000000..7928ee4 --- /dev/null +++ b/collectors/likwid/groups/haswell/RECOVERY.txt @@ -0,0 +1,22 @@ +SHORT Recovery duration + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INT_MISC_RECOVERY_CYCLES +PMC1 INT_MISC_RECOVERY_COUNT + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Avg. recovery duration PMC0/PMC1 + +LONG +Formulas: +Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT +- +This group measures the duration of recoveries after events such as SSE +exceptions, memory disambiguation, etc. diff --git a/collectors/likwid/groups/haswell/TLB_DATA.txt b/collectors/likwid/groups/haswell/TLB_DATA.txt new file mode 100644 index 0000000..8d94e05 --- /dev/null +++ b/collectors/likwid/groups/haswell/TLB_DATA.txt @@ -0,0 +1,35 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK +PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK +PMC2 DTLB_LOAD_MISSES_WALK_DURATION +PMC3 DTLB_STORE_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB load misses PMC0 +L1 DTLB load miss rate PMC0/FIXC0 +L1 DTLB load miss duration [Cyc] PMC2/PMC0 +L1 DTLB store misses PMC1 +L1 DTLB store miss rate PMC1/FIXC0 +L1 DTLB store miss duration [Cyc] PMC3/PMC1 + +LONG +Formulas: +L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK +L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK +- +The DTLB load and store miss rates give a measure of how often a TLB miss occurred +per instruction. The miss duration measures, in cycles, how long a page table walk took. 
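Derived metrics such as the miss durations above divide one raw counter by another, and the denominator can legitimately be zero in a quiet interval (no walk occurred). A hedged Go sketch of the guard a collector would want; the helper name is our own invention:

    // safeDiv guards ratio metrics such as
    //   L1 DTLB load miss duration [Cyc] = PMC2/PMC0
    // against intervals in which the denominator event never fired.
    func safeDiv(num, den float64) float64 {
        if den == 0 {
            return 0 // or NaN, depending on how missing samples are reported
        }
        return num / den
    }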
+ diff --git a/collectors/likwid/groups/haswell/TLB_INSTR.txt b/collectors/likwid/groups/haswell/TLB_INSTR.txt new file mode 100644 index 0000000..235d977 --- /dev/null +++ b/collectors/likwid/groups/haswell/TLB_INSTR.txt @@ -0,0 +1,28 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ITLB_MISSES_CAUSES_A_WALK +PMC1 ITLB_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + + +LONG +Formulas: +L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK +L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK +- +The ITLB miss rate gives a measure of how often a TLB miss occurred +per instruction. The duration measures, in cycles, how long a page table walk took. + diff --git a/collectors/likwid/groups/haswell/TMA.txt b/collectors/likwid/groups/haswell/TMA.txt new file mode 100644 index 0000000..afb4126 --- /dev/null +++ b/collectors/likwid/groups/haswell/TMA.txt @@ -0,0 +1,48 @@ +SHORT Top down cycle allocation + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_RETIRED_RETIRE_SLOTS +PMC2 IDQ_UOPS_NOT_DELIVERED_CORE +PMC3 INT_MISC_RECOVERY_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +IPC FIXC0/FIXC1 +Total Slots 4*FIXC1 +Slots Retired PMC1 +Fetch Bubbles PMC2 +Recovery Bubbles 4*PMC3 +Front End [%] PMC2/(4*FIXC1)*100 +Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100 +Retiring [%] PMC1/(4*FIXC1)*100 +Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100 + +LONG +Formulas: +Total Slots = 4*CPU_CLK_UNHALTED_CORE +Slots Retired = UOPS_RETIRED_RETIRE_SLOTS +Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE +Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES +Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100 +Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100 +Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100 +Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100 +-- +This performance group measures cycles to determine the percentage of time spent in +front end, back end, retiring and speculation. These metrics are published and +verified by Intel. 
Further information: +Webpage describing the Top-Down Method and its usage in Intel VTune: +https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method +Paper by Ahmad Yasin: +https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0 +Slides by Ahmad Yasin: +http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf +The performance group was originally published here: +http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/ diff --git a/collectors/likwid/groups/haswell/UOPS.txt b/collectors/likwid/groups/haswell/UOPS.txt new file mode 100644 index 0000000..e6cc208 --- /dev/null +++ b/collectors/likwid/groups/haswell/UOPS.txt @@ -0,0 +1,35 @@ +SHORT UOPs execution info + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_EXECUTED_THREAD +PMC2 UOPS_RETIRED_ALL +PMC3 UOPS_ISSUED_FLAGS_MERGE + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Issued UOPs PMC0 +Merged UOPs PMC3 +Executed UOPs PMC1 +Retired UOPs PMC2 + +LONG +Formulas: +Issued UOPs = UOPS_ISSUED_ANY +Merged UOPs = UOPS_ISSUED_FLAGS_MERGE +Executed UOPs = UOPS_EXECUTED_THREAD +Retired UOPs = UOPS_RETIRED_ALL +- +This group returns information about the instruction pipeline. It measures the +issued, executed and retired uOPs; comparing these counts yields the number of uOPs +which were issued but not executed, as well as the number which were executed but never retired. +The executed but not retired uOPs commonly come from speculatively executed branches. + diff --git a/collectors/likwid/groups/haswell/UOPS_EXEC.txt b/collectors/likwid/groups/haswell/UOPS_EXEC.txt new file mode 100644 index 0000000..7042df7 --- /dev/null +++ b/collectors/likwid/groups/haswell/UOPS_EXEC.txt @@ -0,0 +1,31 @@ +SHORT UOPs execution + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_USED_CYCLES +PMC1 UOPS_EXECUTED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the execution stage in the pipeline. Used cycles are all cycles where uOPs are +executed while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles. 
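Looking back at the TMA group above: its four percentages are plain arithmetic over five counter deltas, and by construction they sum to 100. A hedged Go sketch (function and parameter names are ours; 4 is the issue width of the pipeline):

    // topDown evaluates the top-down buckets from raw counter deltas.
    func topDown(issued, retiredSlots, fetchBubbles, recoveryCycles, coreCycles float64) (frontEnd, speculation, retiring, backEnd float64) {
        slots := 4 * coreCycles // Total Slots = 4*CPU_CLK_UNHALTED_CORE
        frontEnd = fetchBubbles / slots * 100
        speculation = (issued - retiredSlots + 4*recoveryCycles) / slots * 100
        retiring = retiredSlots / slots * 100
        backEnd = (1 - (fetchBubbles+issued+4*recoveryCycles)/slots) * 100
        return
    }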
diff --git a/collectors/likwid/groups/haswell/UOPS_ISSUE.txt b/collectors/likwid/groups/haswell/UOPS_ISSUE.txt new file mode 100644 index 0000000..9aac923 --- /dev/null +++ b/collectors/likwid/groups/haswell/UOPS_ISSUE.txt @@ -0,0 +1,31 @@ +SHORT UOPs issuing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_USED_CYCLES +PMC1 UOPS_ISSUED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the issue stage in the pipeline. Used cycles are all cycles where uOPs are +issued while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles. diff --git a/collectors/likwid/groups/haswell/UOPS_RETIRE.txt b/collectors/likwid/groups/haswell/UOPS_RETIRE.txt new file mode 100644 index 0000000..0f37585 --- /dev/null +++ b/collectors/likwid/groups/haswell/UOPS_RETIRE.txt @@ -0,0 +1,31 @@ +SHORT UOPs retirement + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_RETIRED_USED_CYCLES +PMC1 UOPS_RETIRED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the retirement stage in the pipeline (re-order buffer). Used cycles are all +cycles where uOPs are retired while unused cycles refer to pipeline stalls. +Moreover, the group calculates the average stall duration in cycles. 
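The three UOPS_* stall groups above all divide UOPS_*_STALL_CYCLES by the same event with the :EDGEDETECT qualifier. Edge detection makes the counter increment only on transitions into the stall condition, so it counts stall episodes rather than stalled cycles, and cycles divided by episodes gives the average stall length. A conceptual Go illustration of the semantics (not how the PMU is actually programmed):

    // risingEdges counts transitions into the stall condition, i.e. the
    // number of distinct stall episodes in a cycle-by-cycle trace.
    func risingEdges(stalled []bool) int {
        n := 0
        prev := false
        for _, s := range stalled {
            if s && !prev {
                n++
            }
            prev = s
        }
        return n
    }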
diff --git a/collectors/likwid/groups/haswellEP/BRANCH.txt b/collectors/likwid/groups/haswellEP/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often, on average, a branch or a mispredicted branch occurred +per retired instruction. The branch misprediction ratio directly states what +fraction of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/haswellEP/CACHES.txt b/collectors/likwid/groups/haswellEP/CACHES.txt new file mode 100644 index 0000000..295a139 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/CACHES.txt @@ -0,0 +1,123 @@ +SHORT Cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L1D_M_EVICT +PMC2 L2_LINES_IN_ALL +PMC3 L2_TRANS_L2_WB +CBOX0C0 LLC_LOOKUP_DATA_READ +CBOX1C0 LLC_LOOKUP_DATA_READ +CBOX2C0 LLC_LOOKUP_DATA_READ +CBOX3C0 LLC_LOOKUP_DATA_READ +CBOX4C0 LLC_LOOKUP_DATA_READ +CBOX5C0 LLC_LOOKUP_DATA_READ +CBOX6C0 LLC_LOOKUP_DATA_READ +CBOX7C0 LLC_LOOKUP_DATA_READ +CBOX8C0 LLC_LOOKUP_DATA_READ +CBOX9C0 LLC_LOOKUP_DATA_READ +CBOX10C0 LLC_LOOKUP_DATA_READ +CBOX11C0 LLC_LOOKUP_DATA_READ +CBOX12C0 LLC_LOOKUP_DATA_READ +CBOX13C0 LLC_LOOKUP_DATA_READ +CBOX14C0 LLC_LOOKUP_DATA_READ +CBOX15C0 LLC_LOOKUP_DATA_READ +CBOX16C0 LLC_LOOKUP_DATA_READ +CBOX17C0 LLC_LOOKUP_DATA_READ +CBOX0C1 LLC_VICTIMS_M +CBOX1C1 LLC_VICTIMS_M +CBOX2C1 LLC_VICTIMS_M +CBOX3C1 LLC_VICTIMS_M +CBOX4C1 LLC_VICTIMS_M +CBOX5C1 LLC_VICTIMS_M +CBOX6C1 LLC_VICTIMS_M +CBOX7C1 LLC_VICTIMS_M +CBOX8C1 LLC_VICTIMS_M +CBOX9C1 LLC_VICTIMS_M +CBOX10C1 LLC_VICTIMS_M +CBOX11C1 LLC_VICTIMS_M +CBOX12C1 LLC_VICTIMS_M +CBOX13C1 LLC_VICTIMS_M +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 +L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time +L3 
to L2 load data volume [GBytes] 1.0E-09*PMC2*64.0 +L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time +L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 +System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0)*64.0/time +System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0)*64.0 +L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64/time +L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64 +L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64.0/time +L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1)*64.0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 + +LONG +Formulas: +L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time +L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64 +L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time +L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64 +L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time +L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64 +L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time +L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64 +L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time +L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64 +L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ))*64/time +System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ))*64 +L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M))*64/time +L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M))*64 +L3 to/from 
system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64/time +L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64 +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 +- +Group to measure cache transfers between L1 and memory. Please note that the +L3 to/from system metrics contain any traffic to the system (memory, +Intel QPI, etc.) but do not seem to capture all of it, because the memory read +bandwidth and the L3 to L2 bandwidth are commonly higher than the system to L3 bandwidth. + diff --git a/collectors/likwid/groups/haswellEP/CBOX.txt b/collectors/likwid/groups/haswellEP/CBOX.txt new file mode 100644 index 0000000..d9cc13c --- /dev/null +++ b/collectors/likwid/groups/haswellEP/CBOX.txt @@ -0,0 +1,61 @@ +SHORT CBOX related data and metrics + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +CBOX0C0 LLC_VICTIMS_M +CBOX1C0 LLC_VICTIMS_M +CBOX2C0 LLC_VICTIMS_M +CBOX3C0 LLC_VICTIMS_M +CBOX4C0 LLC_VICTIMS_M +CBOX5C0 LLC_VICTIMS_M +CBOX6C0 LLC_VICTIMS_M +CBOX7C0 LLC_VICTIMS_M +CBOX8C0 LLC_VICTIMS_M +CBOX9C0 LLC_VICTIMS_M +CBOX10C0 LLC_VICTIMS_M +CBOX11C0 LLC_VICTIMS_M +CBOX12C0 LLC_VICTIMS_M +CBOX13C0 LLC_VICTIMS_M +CBOX14C0 LLC_VICTIMS_M +CBOX15C0 LLC_VICTIMS_M +CBOX16C0 LLC_VICTIMS_M +CBOX17C0 LLC_VICTIMS_M +CBOX0C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX1C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX2C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX3C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX4C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX5C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX6C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX7C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX8C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX9C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX10C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX11C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX12C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX13C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX14C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX15C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX16C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX17C1:STATE=0x1 LLC_LOOKUP_ANY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +LLC misses per instruction (CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0)/FIXC0 +LLC data written to MEM [MBytes] 1E-6*(CBOX0C1:STATE=0x1+CBOX1C1:STATE=0x1+CBOX2C1:STATE=0x1+CBOX3C1:STATE=0x1+CBOX4C1:STATE=0x1+CBOX5C1:STATE=0x1+CBOX6C1:STATE=0x1+CBOX7C1:STATE=0x1+CBOX8C1:STATE=0x1+CBOX9C1:STATE=0x1+CBOX10C1:STATE=0x1+CBOX11C1:STATE=0x1+CBOX12C1:STATE=0x1+CBOX13C1:STATE=0x1+CBOX14C1:STATE=0x1+CBOX15C1:STATE=0x1+CBOX16C1:STATE=0x1+CBOX17C1:STATE=0x1)*64 + + +LONG +Formulas: +LLC misses per instruction = sum(LLC_VICTIMS_M)/INSTR_RETIRED_ANY +LLC data written to MEM [MBytes] = sum(LLC_LOOKUP_ANY)*64*1E-6 +- +The CBOXes mediate the traffic from the L2 cache to the segmented L3 cache. Each +CBOX is responsible for one segment (2.5 MByte). The boxes maintain the coherence between all +CPU cores of the socket. Depending on the CPU core count, some CBOXes are not attached +to a 2.5 MByte slice but are still active and track the traffic. 
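The SUM(...) notation in the CACHES and CBOX formulas above folds one event over all uncore boxes (eighteen CBOXes, eight memory channels). A collector that stores the per-box deltas of an event in a slice can evaluate it directly; a minimal Go sketch, the slice layout being our assumption:

    // sum reduces the per-box readings of one uncore event, e.g. the
    // LLC_LOOKUP_DATA_READ values from CBOX0C0..CBOX17C0.
    func sum(perBox []float64) float64 {
        total := 0.0
        for _, v := range perBox {
            total += v
        }
        return total
    }

    // System to L3 bandwidth [MBytes/s] = 1.0e-06 * sum(llcLookups) * 64.0 / seconds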
diff --git a/collectors/likwid/groups/haswellEP/CLOCK.txt b/collectors/likwid/groups/haswellEP/CLOCK.txt new file mode 100644 index 0000000..8055d5b --- /dev/null +++ b/collectors/likwid/groups/haswellEP/CLOCK.txt @@ -0,0 +1,26 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +UBOXFIX UNCORE_CLOCK + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*UBOXFIX/time +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time +- +Haswell implements the new RAPL interface. This interface enables +monitoring of the consumed energy at the package (socket) level. + diff --git a/collectors/likwid/groups/haswellEP/CYCLE_ACTIVITY.txt b/collectors/likwid/groups/haswellEP/CYCLE_ACTIVITY.txt new file mode 100644 index 0000000..c432a44 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/CYCLE_ACTIVITY.txt @@ -0,0 +1,38 @@ +SHORT Cycle Activities + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING +PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING +PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING +PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Cycles without execution [%] (PMC3/FIXC1)*100 +Cycles without execution due to L1D [%] (PMC2/FIXC1)*100 +Cycles without execution due to L2 [%] (PMC0/FIXC1)*100 +Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100 +-- +This performance group measures the cycles spent waiting for data from the cache +and memory hierarchy. +CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts number of cycles nothing is executed on +any execution port. +CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while memory subsystem has an +outstanding load. 
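The CLOCK group's power metric is just an energy delta over a wall-clock delta, since RAPL exposes energy as a monotonically increasing counter. A hedged Go sketch (we assume both readings are already converted to Joules; the raw MSR value is a 32-bit counter that wraps on long intervals, which a real collector must handle):

    // avgPowerWatts implements Power [W] = PWR_PKG_ENERGY / time from two
    // energy readings taken at the interval boundaries.
    func avgPowerWatts(startJoules, endJoules, seconds float64) float64 {
        return (endJoules - startJoules) / seconds
    }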
diff --git a/collectors/likwid/groups/haswellEP/CYCLE_STALLS.txt b/collectors/likwid/groups/haswellEP/CYCLE_STALLS.txt new file mode 100644 index 0000000..795aeb9 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/CYCLE_STALLS.txt @@ -0,0 +1,45 @@ +SHORT Cycle Activities (Stalls) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING +PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING +PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING +PMC3 CYCLE_ACTIVITY_STALLS_TOTAL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Total execution stalls PMC3 +Stalls caused by L1D misses [%] (PMC2/PMC3)*100 +Stalls caused by L2 misses [%] (PMC0/PMC3)*100 +Stalls caused by memory loads [%] (PMC1/PMC3)*100 +Execution stall rate [%] (PMC3/FIXC1)*100 +Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100 +Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100 +Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL +Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100 +-- +This performance group measures the stalls caused by data traffic in the cache +hierarchy. +CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls. +CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while memory subsystem has +an outstanding load. diff --git a/collectors/likwid/groups/haswellEP/DATA.txt b/collectors/likwid/groups/haswellEP/DATA.txt new file mode 100644 index 0000000..967cbad --- /dev/null +++ b/collectors/likwid/groups/haswellEP/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_UOPS_RETIRED_LOADS +PMC1 MEM_UOPS_RETIRED_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES +- +This is a metric to determine your load to store ratio. 
+ diff --git a/collectors/likwid/groups/haswellEP/DIVIDE.txt b/collectors/likwid/groups/haswellEP/DIVIDE.txt new file mode 100644 index 0000000..c9690cf --- /dev/null +++ b/collectors/likwid/groups/haswellEP/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ARITH_DIVIDER_UOPS +PMC1 ARITH_DIVIDER_CYCLES + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_DIVIDER_UOPS +Avg. divide unit usage duration = ARITH_DIVIDER_CYCLES/ARITH_DIVIDER_UOPS +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/haswellEP/ENERGY.txt b/collectors/likwid/groups/haswellEP/ENERGY.txt new file mode 100644 index 0000000..ee0af1b --- /dev/null +++ b/collectors/likwid/groups/haswellEP/ENERGY.txt @@ -0,0 +1,35 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR3 PWR_DRAM_ENERGY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +Haswell implements the new RAPL interface. This interface enables +monitoring of the consumed energy at the package (socket) and DRAM level. + diff --git a/collectors/likwid/groups/haswellEP/FALSE_SHARE.txt b/collectors/likwid/groups/haswellEP/FALSE_SHARE.txt new file mode 100644 index 0000000..872dbc1 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/FALSE_SHARE.txt @@ -0,0 +1,34 @@ +SHORT False sharing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM +PMC1 MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM +PMC2 MEM_LOAD_UOPS_RETIRED_ALL_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local LLC hit with false sharing [MByte] 1.E-06*PMC0*64 +Local LLC hit with false sharing rate PMC0/PMC2 +Remote LLC false sharing [MByte] 1.E-06*PMC1*64 +Remote LLC false sharing rate PMC1/PMC2 + +LONG +Formulas: +Local LLC hit with false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM*64 +Local LLC hit with false sharing rate = MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL_ALL +Remote LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM*64 +Remote LLC false sharing rate = MEM_LOAD_UOPS_L3_MISS_RETIRED_REMOTE_HITM/MEM_LOAD_UOPS_RETIRED_ALL_ALL +- +False-sharing of cache lines can dramatically reduce the performance of an +application. This performance group measures the L3 traffic induced by false-sharing. +The false-sharing rate uses all memory loads as reference. +For systems with multiple CPU sockets, this performance group also measures the +false-sharing of cache lines over socket boundaries. 
+Please keep in mind that the MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM event may +undercount by as much as 40% (Errata HSW150). diff --git a/collectors/likwid/groups/haswellEP/FLOPS_AVX.txt b/collectors/likwid/groups/haswellEP/FLOPS_AVX.txt new file mode 100644 index 0000000..15aacb8 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/FLOPS_AVX.txt @@ -0,0 +1,28 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 AVX_INSTS_CALC + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC0*4.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(AVX_INSTS_CALC*8)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(AVX_INSTS_CALC*4)/runtime +- +Packed 32b AVX FLOP/s rates. Approximate counts of AVX & AVX2 256-bit instructions. +May count non-AVX instructions that employ 256-bit operations, including (but +not necessarily limited to) rep string instructions that use 256-bit loads and +stores for optimized performance, XSAVE* and XRSTOR*, and operations that +transition the x87 FPU data registers between x87 and MMX. +Caution: The event AVX_INSTS_CALC counts the insertf128 instruction often used +by the Intel C compilers for (unaligned) vector loads. diff --git a/collectors/likwid/groups/haswellEP/HA.txt b/collectors/likwid/groups/haswellEP/HA.txt new file mode 100644 index 0000000..1e5a700 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/HA.txt @@ -0,0 +1,40 @@ +SHORT Main memory bandwidth in MBytes/s seen from Home agent + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +BBOX0C0 IMC_READS_NORMAL +BBOX0C1 BYPASS_IMC_TAKEN +BBOX0C2 IMC_WRITES_ALL +BBOX1C0 IMC_READS_NORMAL +BBOX1C1 BYPASS_IMC_TAKEN +BBOX1C2 IMC_WRITES_ALL + + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(BBOX0C2+BBOX1C2)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(BBOX0C2+BBOX1C2)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1+BBOX0C2+BBOX1C2)*64.0/time +Memory data volume [GBytes] 1.0E-09*(BBOX0C0+BBOX1C0+BBOX0C1+BBOX1C1+BBOX0C2+BBOX1C2)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_READS_NORMAL)+SUM(BYPASS_IMC_TAKEN))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(IMC_READS_NORMAL)+SUM(BYPASS_IMC_TAKEN))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_WRITES_ALL))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(IMC_WRITES_ALL))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(IMC_READS_NORMAL) + SUM(BYPASS_IMC_TAKEN) + SUM(IMC_WRITES_ALL))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(IMC_READS_NORMAL) + SUM(BYPASS_IMC_TAKEN) + SUM(IMC_WRITES_ALL))*64.0 +- +This group derives the same metrics as the MEM group but uses the events of the +Home Agent, a central unit that is responsible for the protocol side of memory +interactions. 
diff --git a/collectors/likwid/groups/haswellEP/ICACHE.txt b/collectors/likwid/groups/haswellEP/ICACHE.txt new file mode 100644 index 0000000..f1e2335 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/ICACHE.txt @@ -0,0 +1,33 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ICACHE_ACCESSES +PMC1 ICACHE_MISSES +PMC2 ICACHE_IFETCH_STALL +PMC3 ILD_STALL_IQ_FULL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 +L1I stalls PMC2 +L1I stall rate PMC2/FIXC0 +L1I queue full stalls PMC3 +L1I queue full stall rate PMC3/FIXC0 + +LONG +Formulas: +L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY +L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES +L1I stalls = ICACHE_IFETCH_STALL +L1I stall rate = ICACHE_IFETCH_STALL / INSTR_RETIRED_ANY +- +This group measures some L1 instruction cache metrics. diff --git a/collectors/likwid/groups/haswellEP/L2.txt b/collectors/likwid/groups/haswellEP/L2.txt new file mode 100644 index 0000000..60c7f79 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/L2.txt @@ -0,0 +1,37 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L2_TRANS_L1D_WB +PMC2 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the +number of cache lines loaded from the L2 to the L1 data cache and the writebacks from +the L1 data cache to the L2 cache. The group also outputs total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and cache lines transferred into the instruction +cache. 
diff --git a/collectors/likwid/groups/haswellEP/L2CACHE.txt b/collectors/likwid/groups/haswellEP/L2CACHE.txt new file mode 100644 index 0000000..9b5dd4b --- /dev/null +++ b/collectors/likwid/groups/haswellEP/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_TRANS_ALL_REQUESTS +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS +- +This group measures the locality of your data accesses with regard to the +L2 cache. The L2 request rate tells you how data intensive your code is, +or, in other words, how many data accesses you have on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally the L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/haswellEP/L3.txt b/collectors/likwid/groups/haswellEP/L3.txt new file mode 100644 index 0000000..0109db3 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/L3.txt @@ -0,0 +1,36 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ALL +PMC1 L2_TRANS_L2_WB + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the +number of cache lines allocated in the L2 and the number of modified cache lines +evicted from the L2. This group also outputs the data volume transferred between the +L3 and the measured cores' L2 caches. Note that this bandwidth also includes data +transfers due to a write allocate load on a store miss in L2. 
+ diff --git a/collectors/likwid/groups/haswellEP/L3CACHE.txt b/collectors/likwid/groups/haswellEP/L3CACHE.txt new file mode 100644 index 0000000..f863daa --- /dev/null +++ b/collectors/likwid/groups/haswellEP/L3CACHE.txt @@ -0,0 +1,35 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_RETIRED_L3_ALL +PMC1 MEM_LOAD_UOPS_RETIRED_L3_MISS +PMC2 UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate PMC0/PMC2 +L3 miss rate PMC1/PMC2 +L3 miss ratio PMC1/PMC0 + +LONG +Formulas: +L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL +L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL +L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL +- +This group measures the locality of your data accesses with regard to the +L3 cache. The L3 request rate tells you how data intensive your code is, +or, in other words, how many data accesses you have on average per instruction. +The L3 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally the L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/haswellEP/MEM.txt b/collectors/likwid/groups/haswellEP/MEM.txt new file mode 100644 index 0000000..2a17a2c --- /dev/null +++ b/collectors/likwid/groups/haswellEP/MEM.txt @@ -0,0 +1,52 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime +Memory 
data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on a +per-socket basis. Some of the counters may not be available on your system. +Also outputs total data volume transferred from main memory. +The same metrics are provided by the HA group. + diff --git a/collectors/likwid/groups/haswellEP/NUMA.txt b/collectors/likwid/groups/haswellEP/NUMA.txt new file mode 100644 index 0000000..41fbe62 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/NUMA.txt @@ -0,0 +1,33 @@ +SHORT Local and remote memory accesses + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 OFFCORE_RESPONSE_0_LOCAL_DRAM +PMC1 OFFCORE_RESPONSE_1_REMOTE_DRAM + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local DRAM data volume [GByte] 1.E-09*PMC0*64 +Local DRAM bandwidth [MByte/s] 1.E-06*(PMC0*64)/time +Remote DRAM data volume [GByte] 1.E-09*PMC1*64 +Remote DRAM bandwidth [MByte/s] 1.E-06*(PMC1*64)/time +Memory data volume [GByte] 1.E-09*(PMC0+PMC1)*64 +Memory bandwidth [MByte/s] 1.E-06*((PMC0+PMC1)*64)/time + +LONG +Formulas: +CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY +Local DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_0_LOCAL_DRAM*64 +Local DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_0_LOCAL_DRAM*64)/time +Remote DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_1_REMOTE_DRAM*64 +Remote DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_1_REMOTE_DRAM*64)/time +Memory data volume [GByte] = 1.E-09*(OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64 +Memory bandwidth [MByte/s] = 1.E-06*((OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64)/time +-- +This performance group measures the data traffic of CPU cores to local and remote +memory. 
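A useful figure to derive from the NUMA group above, though the group itself does not define it, is the share of accesses served by remote DRAM; values near zero indicate good NUMA placement. A hedged Go sketch:

    // remoteFraction computes the remote share from the two
    // OFFCORE_RESPONSE counter deltas of the NUMA group.
    func remoteFraction(localDRAM, remoteDRAM float64) float64 {
        total := localDRAM + remoteDRAM
        if total == 0 {
            return 0
        }
        return remoteDRAM / total
    }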
diff --git a/collectors/likwid/groups/haswellEP/PORT_USAGE.txt b/collectors/likwid/groups/haswellEP/PORT_USAGE.txt new file mode 100644 index 0000000..eb74ffe --- /dev/null +++ b/collectors/likwid/groups/haswellEP/PORT_USAGE.txt @@ -0,0 +1,46 @@ +SHORT Execution port utilization + +REQUIRE_NOHT + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_PORT_PORT_0 +PMC1 UOPS_EXECUTED_PORT_PORT_1 +PMC2 UOPS_EXECUTED_PORT_PORT_2 +PMC3 UOPS_EXECUTED_PORT_PORT_3 +PMC4 UOPS_EXECUTED_PORT_PORT_4 +PMC5 UOPS_EXECUTED_PORT_PORT_5 +PMC6 UOPS_EXECUTED_PORT_PORT_6 +PMC7 UOPS_EXECUTED_PORT_PORT_7 + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Port0 usage ratio PMC0/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port1 usage ratio PMC1/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port2 usage ratio PMC2/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port3 usage ratio PMC3/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port4 usage ratio PMC4/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port5 usage ratio PMC5/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port6 usage ratio PMC6/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port7 usage ratio PMC7/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) + +LONG +Formulas: +Port0 usage ratio = UOPS_EXECUTED_PORT_PORT_0/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port1 usage ratio = UOPS_EXECUTED_PORT_PORT_1/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port2 usage ratio = UOPS_EXECUTED_PORT_PORT_2/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port3 usage ratio = UOPS_EXECUTED_PORT_PORT_3/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port4 usage ratio = UOPS_EXECUTED_PORT_PORT_4/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port5 usage ratio = UOPS_EXECUTED_PORT_PORT_5/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port6 usage ratio = UOPS_EXECUTED_PORT_PORT_6/SUM(UOPS_EXECUTED_PORT_PORT_*) +Port7 usage ratio = UOPS_EXECUTED_PORT_PORT_7/SUM(UOPS_EXECUTED_PORT_PORT_*) +- +This group measures the execution port utilization in a CPU core. The group can +only be measured when HyperThreading is disabled because only then each CPU core +can program eight counters. 
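Since all eight PORT_USAGE ratios share one denominator, a collector can compute them in a single pass. A brief Go sketch (we assume the slice holds the UOPS_EXECUTED_PORT_PORT_0..7 deltas in port order):

    // portRatios returns each port's share of all executed uops; a strongly
    // skewed result hints at pressure on a single execution port.
    func portRatios(uopsPerPort []float64) []float64 {
        total := 0.0
        for _, u := range uopsPerPort {
            total += u
        }
        ratios := make([]float64, len(uopsPerPort))
        if total == 0 {
            return ratios
        }
        for i, u := range uopsPerPort {
            ratios[i] = u / total
        }
        return ratios
    }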
diff --git a/collectors/likwid/groups/haswellEP/QPI.txt b/collectors/likwid/groups/haswellEP/QPI.txt new file mode 100644 index 0000000..dcdda85 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/QPI.txt @@ -0,0 +1,49 @@ +SHORT QPI Link Layer data + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +QBOX0C0 RXL_FLITS_G0_DATA +QBOX1C0 RXL_FLITS_G0_DATA +QBOX0C1 RXL_FLITS_G0_NON_DATA +QBOX1C1 RXL_FLITS_G0_NON_DATA +QBOX0C2 TXL_FLITS_G0_DATA +QBOX1C2 TXL_FLITS_G0_DATA +QBOX0C3 TXL_FLITS_G0_NON_DATA +QBOX1C3 TXL_FLITS_G0_NON_DATA + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +QPI send data volume [GByte] 1.E-09*(QBOX0C2+QBOX1C2)*8 +QPI send data bandwidth [MByte/s] 1.E-06*(QBOX0C2+QBOX1C2)*8/time +QPI send link volume [GByte] 1.E-09*(QBOX0C2+QBOX1C2+QBOX0C3+QBOX1C3)*8 +QPI send link bandwidth [MByte/s] 1.E-06*(QBOX0C2+QBOX1C2+QBOX0C3+QBOX1C3)*8/time +QPI receive data volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0)*8 +QPI receive data bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0)*8/time +QPI receive link volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0+QBOX0C1+QBOX1C1)*8 +QPI receive link bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0+QBOX0C1+QBOX1C1)*8/time +QPI total transfer volume [GByte] 1.E-09*(QBOX0C0+QBOX1C0+QBOX0C2+QBOX1C2+QBOX0C1+QBOX1C1+QBOX0C3+QBOX1C3)*8 +QPI total bandwidth [MByte/s] 1.E-06*(QBOX0C0+QBOX1C0+QBOX0C2+QBOX1C2+QBOX0C1+QBOX1C1+QBOX0C3+QBOX1C3)*8/time + +LONG +Formulas: +QPI send data volume [GByte] = 1.E-09*(sum(TXL_FLITS_G0_DATA)*8) +QPI send data bandwidth [MByte/s] = 1.E-06*(sum(TXL_FLITS_G0_DATA)*8)/runtime +QPI send link volume [GByte] = 1.E-09*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8) +QPI send link bandwidth [MByte/s] = 1.E-06*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8)/runtime +QPI receive data volume [GByte] = 1.E-09*(sum(RXL_FLITS_G0_DATA)*8) +QPI receive data bandwidth [MByte/s] = 1.E-06*(sum(RXL_FLITS_G0_DATA)*8)/runtime +QPI receive link volume [GByte] = 1.E-09*((sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8) +QPI receive link bandwidth [MByte/s] = 1.E-06*((sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8)/runtime +QPI total transfer volume [GByte] = 1.E-09*(sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA)+sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8 +QPI total bandwidth [MByte/s] = 1.E-06*(sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA)+sum(RXL_FLITS_G0_DATA)+sum(RXL_FLITS_G0_NON_DATA))*8/time +-- +The Intel QPI Link Layer is responsible for packetizing requests from the caching agent (CBOXes) +on the way out to the system interface. For Haswell EP systems, the Link Layer and the +Ring interface are separated. The QPI link volume contains header, data and trailer flits, while the +QPI data volume counts only the data flits. diff --git a/collectors/likwid/groups/haswellEP/RECOVERY.txt b/collectors/likwid/groups/haswellEP/RECOVERY.txt new file mode 100644 index 0000000..7928ee4 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/RECOVERY.txt @@ -0,0 +1,22 @@ +SHORT Recovery duration + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INT_MISC_RECOVERY_CYCLES +PMC1 INT_MISC_RECOVERY_COUNT + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Avg. recovery duration PMC0/PMC1 + +LONG +Formulas: +Avg. 
recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT +- +This group measures the duration of recoveries after events such as SSE +exceptions, memory disambiguation, etc. diff --git a/collectors/likwid/groups/haswellEP/SBOX.txt b/collectors/likwid/groups/haswellEP/SBOX.txt new file mode 100644 index 0000000..24f86b6 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/SBOX.txt @@ -0,0 +1,28 @@ +SHORT Ring Transfer bandwidth + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +SBOX0C0 RING_BL_USED_ANY +SBOX1C0 RING_BL_USED_ANY +SBOX2C0 RING_BL_USED_ANY +SBOX3C0 RING_BL_USED_ANY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Ring transfer bandwidth [MByte/s] 1.E-06*(SBOX0C0+SBOX1C0+SBOX2C0+SBOX3C0)*32/time +Ring transfer data volume [GByte] 1.E-09*(SBOX0C0+SBOX1C0+SBOX2C0+SBOX3C0)*32 + +LONG +Formulas: +Ring transfer bandwidth [MByte/s] = 1.E-06*(SUM(SBOXxC0)*32)/time +Ring transfer data volume [GByte] = 1.E-09*(SUM(SBOXxC0)*32) +-- +The SBOXes manage the transfer between the socket local ring(s). For microarchitectures +prior to Haswell, the SBOX and QBOX were similar, as only a single ring was used. +Haswell systems with a high core count assemble two rings that are connected through +the SBOXes; the traffic between the sockets is handled by the QBOXes. diff --git a/collectors/likwid/groups/haswellEP/TLB_DATA.txt b/collectors/likwid/groups/haswellEP/TLB_DATA.txt new file mode 100644 index 0000000..8d94e05 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/TLB_DATA.txt @@ -0,0 +1,35 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK +PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK +PMC2 DTLB_LOAD_MISSES_WALK_DURATION +PMC3 DTLB_STORE_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB load misses PMC0 +L1 DTLB load miss rate PMC0/FIXC0 +L1 DTLB load miss duration [Cyc] PMC2/PMC0 +L1 DTLB store misses PMC1 +L1 DTLB store miss rate PMC1/FIXC0 +L1 DTLB store miss duration [Cyc] PMC3/PMC1 + +LONG +Formulas: +L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK +L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK +- +The DTLB load and store miss rates give a measure of how often a TLB miss occurred +per instruction. The miss duration measures, in cycles, how long a page table walk took. 
+ diff --git a/collectors/likwid/groups/haswellEP/TLB_INSTR.txt b/collectors/likwid/groups/haswellEP/TLB_INSTR.txt new file mode 100644 index 0000000..235d977 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/TLB_INSTR.txt @@ -0,0 +1,28 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ITLB_MISSES_CAUSES_A_WALK +PMC1 ITLB_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + + +LONG +Formulas: +L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK +L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK +- +The ITLB miss rate gives a measure of how often a TLB miss occurred +per instruction. The duration measures, in cycles, how long a page table walk took. + diff --git a/collectors/likwid/groups/haswellEP/TMA.txt b/collectors/likwid/groups/haswellEP/TMA.txt new file mode 100644 index 0000000..afb4126 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/TMA.txt @@ -0,0 +1,48 @@ +SHORT Top down cycle allocation + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_RETIRED_RETIRE_SLOTS +PMC2 IDQ_UOPS_NOT_DELIVERED_CORE +PMC3 INT_MISC_RECOVERY_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +IPC FIXC0/FIXC1 +Total Slots 4*FIXC1 +Slots Retired PMC1 +Fetch Bubbles PMC2 +Recovery Bubbles 4*PMC3 +Front End [%] PMC2/(4*FIXC1)*100 +Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100 +Retiring [%] PMC1/(4*FIXC1)*100 +Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100 + +LONG +Formulas: +Total Slots = 4*CPU_CLK_UNHALTED_CORE +Slots Retired = UOPS_RETIRED_RETIRE_SLOTS +Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE +Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES +Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100 +Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100 +Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100 +Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100 +-- +This performance group measures cycles to determine the percentage of time spent in +front end, back end, retiring and speculation. These metrics are published and +verified by Intel. 
Further information: +Webpage describing Top-Down Method and its usage in Intel vTune: +https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method +Paper by Yasin Ahmad: +https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0 +Slides by Yasin Ahmad: +http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf +The performance group was originally published here: +http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/ diff --git a/collectors/likwid/groups/haswellEP/UOPS.txt b/collectors/likwid/groups/haswellEP/UOPS.txt new file mode 100644 index 0000000..e6cc208 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/UOPS.txt @@ -0,0 +1,35 @@ +SHORT UOPs execution info + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_EXECUTED_THREAD +PMC2 UOPS_RETIRED_ALL +PMC3 UOPS_ISSUED_FLAGS_MERGE + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Issued UOPs PMC0 +Merged UOPs PMC3 +Executed UOPs PMC1 +Retired UOPs PMC2 + +LONG +Formulas: +Issued UOPs = UOPS_ISSUED_ANY +Merged UOPs = UOPS_ISSUED_FLAGS_MERGE +Executed UOPs = UOPS_EXECUTED_THREAD +Retired UOPs = UOPS_RETIRED_ALL +- +This group returns information about the instruction pipeline. It measures the +issued, executed and retired uOPs and returns the number of uOPs which were issued +but not executed as well as the number of uOPs which were executed but never retired. +The executed but not retired uOPs commonly come from speculatively executed branches. + diff --git a/collectors/likwid/groups/haswellEP/UOPS_EXEC.txt b/collectors/likwid/groups/haswellEP/UOPS_EXEC.txt new file mode 100644 index 0000000..7042df7 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/UOPS_EXEC.txt @@ -0,0 +1,31 @@ +SHORT UOPs execution + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_USED_CYCLES +PMC1 UOPS_EXECUTED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the execution stage in the pipeline. Used cycles are all cycles where uOPs are +executed while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles. 
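The EDGEDETECT qualifier used above turns a stall-cycle count into a count of distinct stall episodes, which is what makes the average stall duration computable. A hedged Go sketch with invented counter values:

package main

import "fmt"

func main() {
	usedCyc := 6.0e8       // UOPS_EXECUTED_USED_CYCLES (PMC0)
	stallCyc := 2.0e8      // UOPS_EXECUTED_STALL_CYCLES (PMC1)
	totalCyc := 8.0e8      // CPU_CLOCK_UNHALTED_TOTAL_CYCLES (PMC2)
	stallEpisodes := 4.0e6 // UOPS_EXECUTED_STALL_CYCLES with EDGEDETECT (PMC3)

	fmt.Printf("Used cycles ratio [%%]: %.1f\n", 100*usedCyc/totalCyc)
	fmt.Printf("Unused cycles ratio [%%]: %.1f\n", 100*stallCyc/totalCyc)
	// The edge-detected counter increments once per stall episode, so dividing
	// total stall cycles by it yields the average stall length.
	fmt.Printf("Avg stall duration [cycles]: %.1f\n", stallCyc/stallEpisodes)
}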
diff --git a/collectors/likwid/groups/haswellEP/UOPS_ISSUE.txt b/collectors/likwid/groups/haswellEP/UOPS_ISSUE.txt new file mode 100644 index 0000000..9aac923 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/UOPS_ISSUE.txt @@ -0,0 +1,31 @@ +SHORT UOPs issuing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_USED_CYCLES +PMC1 UOPS_ISSUED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the issue stage in the pipeline. Used cycles are all cycles where uOPs are +issued while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles. diff --git a/collectors/likwid/groups/haswellEP/UOPS_RETIRE.txt b/collectors/likwid/groups/haswellEP/UOPS_RETIRE.txt new file mode 100644 index 0000000..0f37585 --- /dev/null +++ b/collectors/likwid/groups/haswellEP/UOPS_RETIRE.txt @@ -0,0 +1,31 @@ +SHORT UOPs retirement + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_RETIRED_USED_CYCLES +PMC1 UOPS_RETIRED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the retirement stage in the pipeline (re-order buffer). Used cycles are all +cycles where uOPs are retired while unused cycles refer to pipeline stalls. +Moreover, the group calculates the average stall duration in cycles.
diff --git a/collectors/likwid/groups/interlagos/BRANCH.txt b/collectors/likwid/groups/interlagos/BRANCH.txt new file mode 100644 index 0000000..7495b74 --- /dev/null +++ b/collectors/likwid/groups/interlagos/BRANCH.txt @@ -0,0 +1,26 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +PMC0 RETIRED_INSTRUCTIONS +PMC1 RETIRED_BRANCH_INSTR +PMC2 RETIRED_MISPREDICTED_BRANCH_INSTR + +METRICS +Runtime (RDTSC) [s] time +Branch rate PMC1/PMC0 +Branch misprediction rate PMC2/PMC0 +Branch misprediction ratio PMC2/PMC1 +Instructions per branch PMC0/PMC1 + +LONG +Formulas: +Branch rate = RETIRED_BRANCH_INSTR/RETIRED_INSTRUCTIONS +Branch misprediction rate = RETIRED_MISPREDICTED_BRANCH_INSTR/RETIRED_INSTRUCTIONS +Branch misprediction ratio = RETIRED_MISPREDICTED_BRANCH_INSTR/RETIRED_BRANCH_INSTR +Instructions per branch = RETIRED_INSTRUCTIONS/RETIRED_BRANCH_INSTR +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio states directly +what fraction of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/interlagos/CACHE.txt b/collectors/likwid/groups/interlagos/CACHE.txt new file mode 100644 index 0000000..0d785fc --- /dev/null +++ b/collectors/likwid/groups/interlagos/CACHE.txt @@ -0,0 +1,32 @@ +SHORT Data cache miss rate/ratio + +EVENTSET +PMC0 RETIRED_INSTRUCTIONS +PMC1 DATA_CACHE_ACCESSES +PMC2 DATA_CACHE_REFILLS_VALID +PMC3 DATA_CACHE_MISSES_ALL + +METRICS +Runtime (RDTSC) [s] time +data cache misses PMC3 +data cache request rate PMC1/PMC0 +data cache miss rate (PMC2)/PMC0 +data cache miss ratio (PMC2)/PMC1 + +LONG +Formulas: +data cache misses = DATA_CACHE_MISSES_ALL +data cache request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS +data cache miss rate = (DATA_CACHE_REFILLS_VALID) / RETIRED_INSTRUCTIONS +data cache miss ratio = (DATA_CACHE_REFILLS_VALID)/DATA_CACHE_ACCESSES +- +This group measures the locality of your data accesses with regard to the +L1 cache. Data cache request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The data cache miss rate gives a measure of how often it was necessary to get +cache lines from higher levels of the memory hierarchy. And finally +data cache miss ratio tells you how many of your memory references required +a cache line to be loaded from a higher level. While the data cache miss rate +might be given by your algorithm, you should try to get the data cache miss ratio +as low as possible by increasing your cache reuse. + diff --git a/collectors/likwid/groups/interlagos/CPI.txt b/collectors/likwid/groups/interlagos/CPI.txt new file mode 100644 index 0000000..c0746e7 --- /dev/null +++ b/collectors/likwid/groups/interlagos/CPI.txt @@ -0,0 +1,26 @@ +SHORT Cycles per instruction + +EVENTSET +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_UOPS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC1*inverseClock +CPI PMC1/PMC0 +CPI (based on uops) PMC1/PMC2 +IPC PMC0/PMC1 + +LONG +Formulas: +CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS +CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS +IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED +- +This group measures how efficiently the processor works with +regard to instruction throughput. Also important as a standalone +metric is RETIRED_INSTRUCTIONS as it tells you how many instructions +you need to execute for a task.
An optimization might show very +low CPI values but execute many more instructions for it. + diff --git a/collectors/likwid/groups/interlagos/DATA.txt b/collectors/likwid/groups/interlagos/DATA.txt new file mode 100644 index 0000000..75f1f60 --- /dev/null +++ b/collectors/likwid/groups/interlagos/DATA.txt @@ -0,0 +1,16 @@ +SHORT Load to store ratio + +EVENTSET +PMC0 LS_DISPATCH_LOADS +PMC1 LS_DISPATCH_STORES + +METRICS +Runtime (RDTSC) [s] time +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = LS_DISPATCH_LOADS/LS_DISPATCH_STORES +- +This is a simple metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/interlagos/FLOPS_DP.txt b/collectors/likwid/groups/interlagos/FLOPS_DP.txt new file mode 100644 index 0000000..7af248c --- /dev/null +++ b/collectors/likwid/groups/interlagos/FLOPS_DP.txt @@ -0,0 +1,23 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_UOPS +PMC3 RETIRED_FLOPS_DOUBLE_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC1*inverseClock +DP [MFLOP/s] 1.0E-06*(PMC3)/time +CPI PMC1/PMC0 +CPI (based on uops) PMC1/PMC2 +IPC PMC0/PMC1 + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(RETIRED_FLOPS_DOUBLE_ALL)/time +- +Profiling group to measure double precision FLOP rate. + + diff --git a/collectors/likwid/groups/interlagos/FLOPS_SP.txt b/collectors/likwid/groups/interlagos/FLOPS_SP.txt new file mode 100644 index 0000000..14af2c2 --- /dev/null +++ b/collectors/likwid/groups/interlagos/FLOPS_SP.txt @@ -0,0 +1,23 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_UOPS +PMC3 RETIRED_FLOPS_SINGLE_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC1*inverseClock +SP [MFLOP/s] 1.0E-06*(PMC3)/time +CPI PMC1/PMC0 +CPI (based on uops) PMC1/PMC2 +IPC PMC0/PMC1 + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(RETIRED_FLOPS_SINGLE_ALL)/time +- +Profiling group to measure single precision FLOP rate. + + diff --git a/collectors/likwid/groups/interlagos/FPU_EXCEPTION.txt b/collectors/likwid/groups/interlagos/FPU_EXCEPTION.txt new file mode 100644 index 0000000..0969ae1 --- /dev/null +++ b/collectors/likwid/groups/interlagos/FPU_EXCEPTION.txt @@ -0,0 +1,21 @@ +SHORT Floating point exceptions + +EVENTSET +PMC0 RETIRED_INSTRUCTIONS +PMC1 RETIRED_FP_INSTRUCTIONS_ALL +PMC2 FPU_EXCEPTION_ALL + +METRICS +Runtime (RDTSC) [s] time +Overall FP exception rate PMC2/PMC0 +FP exception rate PMC2/PMC1 + +LONG +Formulas: +Overall FP exception rate = FPU_EXCEPTION_ALL / RETIRED_INSTRUCTIONS +FP exception rate = FPU_EXCEPTION_ALL / RETIRED_FP_INSTRUCTIONS_ALL +- +Floating point exceptions occur e.g. during the treatment of denormal numbers. +There might be a large penalty if there are too many floating point +exceptions.
+ diff --git a/collectors/likwid/groups/interlagos/ICACHE.txt b/collectors/likwid/groups/interlagos/ICACHE.txt new file mode 100644 index 0000000..62b91d6 --- /dev/null +++ b/collectors/likwid/groups/interlagos/ICACHE.txt @@ -0,0 +1,23 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +PMC0 INSTRUCTION_CACHE_FETCHES +PMC1 INSTRUCTION_CACHE_L2_REFILLS +PMC2 INSTRUCTION_CACHE_SYSTEM_REFILLS +PMC3 RETIRED_INSTRUCTIONS + +METRICS +Runtime (RDTSC) [s] time +L1I request rate PMC0/PMC3 +L1I miss rate (PMC1+PMC2)/PMC3 +L1I miss ratio (PMC1+PMC2)/PMC0 + +LONG +Formulas: +L1I request rate = INSTRUCTION_CACHE_FETCHES / RETIRED_INSTRUCTIONS +L1I miss rate = (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/RETIRED_INSTRUCTIONS +L1I miss ratio = (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/INSTRUCTION_CACHE_FETCHES +- +This group measures the locality of your instruction code with regard to the +L1 I-Cache. + diff --git a/collectors/likwid/groups/interlagos/L2.txt b/collectors/likwid/groups/interlagos/L2.txt new file mode 100644 index 0000000..4d90ef8 --- /dev/null +++ b/collectors/likwid/groups/interlagos/L2.txt @@ -0,0 +1,29 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +PMC0 DATA_CACHE_REFILLS_ALL +PMC1 DATA_CACHE_REFILLS_SYSTEM +PMC2 CPU_CLOCKS_UNHALTED + +METRICS +Runtime (RDTSC) [s] time +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0-PMC1)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0-PMC1)*64.0 +Cache refill bandwidth System/L2 [MBytes/s] 1.0E-06*PMC0*64.0/time +Cache refill bandwidth System [MBytes/s] 1.0E-06*PMC1*64.0/time + +LONG +Formulas: +L2 bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_ALL-DATA_CACHE_REFILLS_SYSTEM)*64/time +L2 data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_ALL-DATA_CACHE_REFILLS_SYSTEM)*64 +Cache refill bandwidth system/L2 [MBytes/s] = 1.0E-06*DATA_CACHE_REFILLS_ALL*64/time +Cache refill bandwidth system [MBytes/s] = 1.0E-06*DATA_CACHE_REFILLS_SYSTEM*64/time +- +Profiling group to measure L2 cache bandwidth. The bandwidth is +computed by the number of cache lines loaded from L2 to L1 and the +number of modified cache lines evicted from the L1. +Note that this bandwidth also includes data transfers due to a +write allocate load on a store miss in L1 and copy back transfers +originating from L2. The L2 data volume is the total data volume transferred +between L2 and L1. + diff --git a/collectors/likwid/groups/interlagos/L2CACHE.txt b/collectors/likwid/groups/interlagos/L2CACHE.txt new file mode 100644 index 0000000..49b9555 --- /dev/null +++ b/collectors/likwid/groups/interlagos/L2CACHE.txt @@ -0,0 +1,31 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +PMC0 RETIRED_INSTRUCTIONS +PMC1 REQUESTS_TO_L2_DC_FILL +PMC2 L2_CACHE_MISS_DC_FILL + +METRICS +Runtime (RDTSC) [s] time +L2 request rate PMC1/PMC0 +L2 miss rate PMC2/PMC0 +L2 miss ratio PMC2/PMC1 + +LONG +Formulas: +L2 request rate = REQUESTS_TO_L2_DC_FILL/RETIRED_INSTRUCTIONS +L2 miss rate = L2_CACHE_MISS_DC_FILL/RETIRED_INSTRUCTIONS +L2 miss ratio = L2_CACHE_MISS_DC_FILL/REQUESTS_TO_L2_DC_FILL +- +This group measures the locality of your data accesses with regard to the L2 +Cache. L2 request rate tells you how data intensive your code is or how many +data accesses you have on average per instruction. The L2 miss rate gives a +measure of how often it was necessary to get cache lines from memory. And finally +L2 miss ratio tells you how many of your memory references required a cache line +to be loaded from a higher level.
While the data cache miss rate might be +given by your algorithm, you should try to get the data cache miss ratio as low as +possible by increasing your cache reuse. This group is inspired by the +whitepaper "Basic Performance Measurements for AMD Athlon 64, AMD Opteron and +AMD Phenom Processors" by Paul J. Drongowski. + + diff --git a/collectors/likwid/groups/interlagos/L3.txt b/collectors/likwid/groups/interlagos/L3.txt new file mode 100644 index 0000000..5c9ea4d --- /dev/null +++ b/collectors/likwid/groups/interlagos/L3.txt @@ -0,0 +1,29 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +PMC0 L2_FILL_WB_FILL +PMC1 L2_FILL_WB_WB +PMC2 CPU_CLOCKS_UNHALTED + +METRICS +Runtime (RDTSC) [s] time +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_FILL_WB_FILL*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_FILL_WB_FILL*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_FILL_WB_WB*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_FILL_WB_WB*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_FILL_WB_FILL+L2_FILL_WB_WB)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_FILL_WB_FILL+L2_FILL_WB_WB)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is +computed by the number of cache lines loaded from L3 to L2 and the +number of modified cache lines evicted from the L2. + diff --git a/collectors/likwid/groups/interlagos/L3CACHE.txt b/collectors/likwid/groups/interlagos/L3CACHE.txt new file mode 100644 index 0000000..5a442c6 --- /dev/null +++ b/collectors/likwid/groups/interlagos/L3CACHE.txt @@ -0,0 +1,35 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +PMC0 RETIRED_INSTRUCTIONS +UPMC0 UNC_READ_REQ_TO_L3_ALL +UPMC1 UNC_L3_CACHE_MISS_ALL +UPMC2 UNC_L3_LATENCY_CYCLE_COUNT +UPMC3 UNC_L3_LATENCY_REQUEST_COUNT + +METRICS +Runtime (RDTSC) [s] time +L3 request rate UPMC0/PMC0 +L3 miss rate UPMC1/PMC0 +L3 miss ratio UPMC1/UPMC0 +L3 average access latency [cycles] UPMC2/UPMC3 + +LONG +Formulas: +L3 request rate = UNC_READ_REQ_TO_L3_ALL/INSTRUCTIONS_RETIRED +L3 miss rate = UNC_L3_CACHE_MISS_ALL/INSTRUCTIONS_RETIRED +L3 miss ratio = UNC_L3_CACHE_MISS_ALL/UNC_READ_REQ_TO_L3_ALL +L3 average access latency = UNC_L3_LATENCY_CYCLE_COUNT/UNC_L3_LATENCY_REQUEST_COUNT +- +This group measures the locality of your data accesses with regard to the L3 +Cache. L3 request rate tells you how data intensive your code is or how many +data accesses you have on average per instruction. The L3 miss rate gives a +measure of how often it was necessary to get cache lines from memory. And finally +L3 miss ratio tells you how many of your memory references required a cache line +to be loaded from a higher level. While the data cache miss rate might be +given by your algorithm, you should try to get the data cache miss ratio as low as +possible by increasing your cache reuse. This group was inspired by the +whitepaper "Basic Performance Measurements for AMD Athlon 64, AMD Opteron and +AMD Phenom Processors" by Paul J. Drongowski.
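The uncore latency counters in the L3CACHE group pair a cycle count with a request count, so the average access latency falls out as a simple quotient. A small Go sketch with made-up counter values:

package main

import "fmt"

func main() {
	retiredInstr := 5.0e9 // RETIRED_INSTRUCTIONS (PMC0)
	l3Requests := 4.0e7   // UNC_READ_REQ_TO_L3_ALL (UPMC0)
	l3Misses := 1.0e7     // UNC_L3_CACHE_MISS_ALL (UPMC1)
	latCycles := 3.2e9    // UNC_L3_LATENCY_CYCLE_COUNT (UPMC2)
	latRequests := 4.0e7  // UNC_L3_LATENCY_REQUEST_COUNT (UPMC3)

	fmt.Printf("L3 request rate: %.2e\n", l3Requests/retiredInstr)
	fmt.Printf("L3 miss rate: %.2e\n", l3Misses/retiredInstr)
	fmt.Printf("L3 miss ratio: %.2f\n", l3Misses/l3Requests)
	// Total latency cycles divided by the number of latency-tracked requests
	// gives the average L3 access latency.
	fmt.Printf("L3 avg access latency [cycles]: %.1f\n", latCycles/latRequests)
}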
+ + diff --git a/collectors/likwid/groups/interlagos/LINKS.txt b/collectors/likwid/groups/interlagos/LINKS.txt new file mode 100644 index 0000000..dbf3cd0 --- /dev/null +++ b/collectors/likwid/groups/interlagos/LINKS.txt @@ -0,0 +1,26 @@ +SHORT Bandwidth on the Hypertransport links + +EVENTSET +UPMC0 UNC_LINK_TRANSMIT_BW_L0_USE +UPMC1 UNC_LINK_TRANSMIT_BW_L1_USE +UPMC2 UNC_LINK_TRANSMIT_BW_L2_USE +UPMC3 UNC_LINK_TRANSMIT_BW_L3_USE + +METRICS +Runtime (RDTSC) [s] time +Link bandwidth L0 [MBytes/s] 1.0E-06*UPMC0*4.0/time +Link bandwidth L1 [MBytes/s] 1.0E-06*UPMC1*4.0/time +Link bandwidth L2 [MBytes/s] 1.0E-06*UPMC2*4.0/time +Link bandwidth L3 [MBytes/s] 1.0E-06*UPMC3*4.0/time + +LONG +Formulas: +Link bandwidth L0 [MBytes/s] = 1.0E-06*UNC_LINK_TRANSMIT_BW_L0_USE*4.0/time +Link bandwidth L1 [MBytes/s] = 1.0E-06*UNC_LINK_TRANSMIT_BW_L1_USE*4.0/time +Link bandwidth L2 [MBytes/s] = 1.0E-06*UNC_LINK_TRANSMIT_BW_L2_USE*4.0/time +Link bandwidth L3 [MBytes/s] = 1.0E-06*UNC_LINK_TRANSMIT_BW_L3_USE*4.0/time +- +Profiling group to measure the HyperTransport link bandwidth for the four links +of a local node. This indicates the data flow between different ccNUMA nodes. + + diff --git a/collectors/likwid/groups/interlagos/MEM.txt b/collectors/likwid/groups/interlagos/MEM.txt new file mode 100644 index 0000000..2fa9dfe --- /dev/null +++ b/collectors/likwid/groups/interlagos/MEM.txt @@ -0,0 +1,20 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +UPMC0 UNC_DRAM_ACCESSES_DCT0_ALL +UPMC1 UNC_DRAM_ACCESSES_DCT1_ALL + +METRICS +Runtime (RDTSC) [s] time +Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64.0 + +LONG +Formulas: +Memory bandwidth [MBytes/s] = 1.0E-06*(DRAM_ACCESSES_DCTO_ALL+DRAM_ACCESSES_DCT1_ALL)*64/time +Memory data volume [GBytes] = 1.0E-09*(DRAM_ACCESSES_DCTO_ALL+DRAM_ACCESSES_DCT1_ALL)*64 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Note: As this group measures the accesses from all cores it only makes sense +to measure with one core per socket, similar to the Intel Nehalem Uncore events. + diff --git a/collectors/likwid/groups/interlagos/NUMA.txt b/collectors/likwid/groups/interlagos/NUMA.txt new file mode 100644 index 0000000..79f3618 --- /dev/null +++ b/collectors/likwid/groups/interlagos/NUMA.txt @@ -0,0 +1,28 @@ +SHORT Read/Write Events between the ccNUMA nodes + +EVENTSET +UPMC0 UNC_CPU_TO_DRAM_LOCAL_TO_0 +UPMC1 UNC_CPU_TO_DRAM_LOCAL_TO_1 +UPMC2 UNC_CPU_TO_DRAM_LOCAL_TO_2 +UPMC3 UNC_CPU_TO_DRAM_LOCAL_TO_3 + +METRICS +Runtime (RDTSC) [s] time +DRAM read/write local to 0 [MegaEvents/s] 1.0E-06*UPMC0/time +DRAM read/write local to 1 [MegaEvents/s] 1.0E-06*UPMC1/time +DRAM read/write local to 2 [MegaEvents/s] 1.0E-06*UPMC2/time +DRAM read/write local to 3 [MegaEvents/s] 1.0E-06*UPMC3/time + +LONG +Formulas: +DRAM read/write local to 0 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_0/time +DRAM read/write local to 1 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_1/time +DRAM read/write local to 2 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_2/time +DRAM read/write local to 3 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_3/time +- +Profiling group to measure the traffic from local CPU to the different +DRAM NUMA nodes. This group helps to detect NUMA problems in a threaded +code. You must first determine on which memory domains your code is running. +A code should only have significant traffic to its own memory domain.
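For the MEM group above, bandwidth follows directly from the DRAM access counts because every access moves one 64-byte cache line. A minimal Go sketch with invented numbers:

package main

import "fmt"

func main() {
	dct0 := 1.5e9   // UNC_DRAM_ACCESSES_DCT0_ALL (UPMC0)
	dct1 := 1.4e9   // UNC_DRAM_ACCESSES_DCT1_ALL (UPMC1)
	seconds := 10.0 // measurement interval ("time")

	// Each DRAM access moves one 64-byte cache line.
	fmt.Printf("Memory bandwidth [MBytes/s]: %.1f\n", 1.0e-06*(dct0+dct1)*64.0/seconds)
	fmt.Printf("Memory data volume [GBytes]: %.2f\n", 1.0e-09*(dct0+dct1)*64.0)
}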
+ + diff --git a/collectors/likwid/groups/interlagos/NUMA_0_3.txt b/collectors/likwid/groups/interlagos/NUMA_0_3.txt new file mode 100644 index 0000000..79f3618 --- /dev/null +++ b/collectors/likwid/groups/interlagos/NUMA_0_3.txt @@ -0,0 +1,28 @@ +SHORT Read/Write Events between the ccNUMA nodes + +EVENTSET +UPMC0 UNC_CPU_TO_DRAM_LOCAL_TO_0 +UPMC1 UNC_CPU_TO_DRAM_LOCAL_TO_1 +UPMC2 UNC_CPU_TO_DRAM_LOCAL_TO_2 +UPMC3 UNC_CPU_TO_DRAM_LOCAL_TO_3 + +METRICS +Runtime (RDTSC) [s] time +DRAM read/write local to 0 [MegaEvents/s] 1.0E-06*UPMC0/time +DRAM read/write local to 1 [MegaEvents/s] 1.0E-06*UPMC1/time +DRAM read/write local to 2 [MegaEvents/s] 1.0E-06*UPMC2/time +DRAM read/write local to 3 [MegaEvents/s] 1.0E-06*UPMC3/time + +LONG +Formulas: +DRAM read/write local to 0 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_0/time +DRAM read/write local to 1 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_1/time +DRAM read/write local to 2 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_2/time +DRAM read/write local to 3 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_3/time +- +Profiling group to measure the traffic from local CPU to the different +DRAM NUMA nodes. This group helps to detect NUMA problems in a threaded +code. You must first determine on which memory domains your code is running. +A code should only have significant traffic to its own memory domain. + + diff --git a/collectors/likwid/groups/interlagos/NUMA_4_7.txt b/collectors/likwid/groups/interlagos/NUMA_4_7.txt new file mode 100644 index 0000000..0e05776 --- /dev/null +++ b/collectors/likwid/groups/interlagos/NUMA_4_7.txt @@ -0,0 +1,28 @@ +SHORT Read/Write Events between the ccNUMA nodes + +EVENTSET +UPMC0 UNC_CPU_TO_DRAM_LOCAL_TO_4 +UPMC1 UNC_CPU_TO_DRAM_LOCAL_TO_5 +UPMC2 UNC_CPU_TO_DRAM_LOCAL_TO_6 +UPMC3 UNC_CPU_TO_DRAM_LOCAL_TO_7 + +METRICS +Runtime (RDTSC) [s] time +DRAM read/write local to 4 [MegaEvents/s] 1.0E-06*UPMC0/time +DRAM read/write local to 5 [MegaEvents/s] 1.0E-06*UPMC1/time +DRAM read/write local to 6 [MegaEvents/s] 1.0E-06*UPMC2/time +DRAM read/write local to 7 [MegaEvents/s] 1.0E-06*UPMC3/time + +LONG +Formulas: +DRAM read/write local to 4 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_4/time +DRAM read/write local to 5 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_5/time +DRAM read/write local to 6 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_6/time +DRAM read/write local to 7 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_7/time +- +Profiling group to measure the traffic from local CPU to the different +DRAM NUMA nodes. This group helps to detect NUMA problems in a threaded +code. You must first determine on which memory domains your code is running. +A code should only have significant traffic to its own memory domain. + +
+ + diff --git a/collectors/likwid/groups/ivybridge/BRANCH.txt b/collectors/likwid/groups/ivybridge/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio sets directly +into relation what ratio of all branch instruction where mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/ivybridge/CLOCK.txt b/collectors/likwid/groups/ivybridge/CLOCK.txt new file mode 100644 index 0000000..fb19101 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/CLOCK.txt @@ -0,0 +1,26 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +UBOXFIX UNCORE_CLOCK + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*UBOXFIX/time +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time +- +IvyBridge implements the new RAPL interface. This interface enables to +monitor the consumed energy on the package (socket) level. 
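Since RAPL reports accumulated energy, power is derived by dividing by the measurement interval, and the uncore clock follows the same pattern. A short Go sketch; the values are invented:

package main

import "fmt"

func main() {
	energyJ := 450.0    // PWR_PKG_ENERGY (PWR0), already scaled to Joule
	uncoreCyc := 2.4e10 // UNCORE_CLOCK (UBOXFIX)
	seconds := 15.0     // measurement interval ("time")

	// Average package power over the interval.
	fmt.Printf("Power [W]: %.1f\n", energyJ/seconds)
	// Average uncore frequency in MHz.
	fmt.Printf("Uncore Clock [MHz]: %.1f\n", 1.0e-06*uncoreCyc/seconds)
}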
+ diff --git a/collectors/likwid/groups/ivybridge/CYCLE_ACTIVITY.txt b/collectors/likwid/groups/ivybridge/CYCLE_ACTIVITY.txt new file mode 100644 index 0000000..c432a44 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/CYCLE_ACTIVITY.txt @@ -0,0 +1,38 @@ +SHORT Cycle Activities + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING +PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING +PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING +PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Cycles without execution [%] (PMC3/FIXC1)*100 +Cycles without execution due to L1D [%] (PMC2/FIXC1)*100 +Cycles without execution due to L2 [%] (PMC0/FIXC1)*100 +Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100 +Cycles with stalls due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles with stalls due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100 +-- +This performance group measures the cycles while waiting for data from the cache +and memory hierarchy. +CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts number of cycles nothing is executed on +any execution port. +CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while memory subsystem has an +outstanding load. 
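Each of these metrics is a pending-cycle count normalized to core clock cycles. A compact Go sketch with made-up counter values:

package main

import "fmt"

func main() {
	coreCyc := 1.0e9 // CPU_CLK_UNHALTED_CORE (FIXC1)
	noExec := 3.0e8  // CYCLE_ACTIVITY_CYCLES_NO_EXECUTE (PMC3)
	l1dPend := 1.2e8 // CYCLE_ACTIVITY_CYCLES_L1D_PENDING (PMC2)
	l2Pend := 8.0e7  // CYCLE_ACTIVITY_CYCLES_L2_PENDING (PMC0)
	ldmPend := 2.0e8 // CYCLE_ACTIVITY_CYCLES_LDM_PENDING (PMC1)

	// All four metrics share the same normalization to core cycles.
	pct := func(part float64) float64 { return part / coreCyc * 100 }
	fmt.Printf("Cycles without execution [%%]: %.1f\n", pct(noExec))
	fmt.Printf("Cycles without execution due to L1D [%%]: %.1f\n", pct(l1dPend))
	fmt.Printf("Cycles without execution due to L2 [%%]: %.1f\n", pct(l2Pend))
	fmt.Printf("Cycles without execution due to memory loads [%%]: %.1f\n", pct(ldmPend))
}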
diff --git a/collectors/likwid/groups/ivybridge/CYCLE_STALLS.txt b/collectors/likwid/groups/ivybridge/CYCLE_STALLS.txt new file mode 100644 index 0000000..795aeb9 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/CYCLE_STALLS.txt @@ -0,0 +1,45 @@ +SHORT Cycle Activities (Stalls) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING +PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING +PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING +PMC3 CYCLE_ACTIVITY_STALLS_TOTAL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Total execution stalls PMC3 +Stalls caused by L1D misses [%] (PMC2/PMC3)*100 +Stalls caused by L2 misses [%] (PMC0/PMC3)*100 +Stalls caused by memory loads [%] (PMC1/PMC3)*100 +Execution stall rate [%] (PMC3/FIXC1)*100 +Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100 +Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100 +Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL +Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100 +-- +This performance group measures the stalls caused by data traffic in the cache +hierarchy. +CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls. +CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while memory subsystem has +an outstanding load. diff --git a/collectors/likwid/groups/ivybridge/DATA.txt b/collectors/likwid/groups/ivybridge/DATA.txt new file mode 100644 index 0000000..967cbad --- /dev/null +++ b/collectors/likwid/groups/ivybridge/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_UOPS_RETIRED_LOADS +PMC1 MEM_UOPS_RETIRED_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES +- +This is a metric to determine your load to store ratio. 
+ diff --git a/collectors/likwid/groups/ivybridge/DIVIDE.txt b/collectors/likwid/groups/ivybridge/DIVIDE.txt new file mode 100644 index 0000000..f8cb0b3 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ARITH_NUM_DIV +PMC1 ARITH_FPU_DIV_ACTIVE + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_NUM_DIV +Avg. divide unit usage duration = ARITH_FPU_DIV_ACTIVE/ARITH_NUM_DIV +- +This performance group measures the average latency of divide operations diff --git a/collectors/likwid/groups/ivybridge/ENERGY.txt b/collectors/likwid/groups/ivybridge/ENERGY.txt new file mode 100644 index 0000000..92a6915 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/ENERGY.txt @@ -0,0 +1,37 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR2 PWR_PP1_ENERGY +PWR3 PWR_DRAM_ENERGY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy PP1 [J] PWR2 +Power PP1 [W] PWR2/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power PP1 = PWR_PP1_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +IvyBridge implements the new RAPL interface. This interface enables to +monitor the consumed energy on the package (socket), the PP0 domain +and DRAM level. The PP0 domain often refers to only the CPU cores. diff --git a/collectors/likwid/groups/ivybridge/FALSE_SHARE.txt b/collectors/likwid/groups/ivybridge/FALSE_SHARE.txt new file mode 100644 index 0000000..fbec3f4 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/FALSE_SHARE.txt @@ -0,0 +1,25 @@ +SHORT False sharing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM +PMC2 MEM_LOAD_UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local LLC false sharing [MByte] 1.E-06*PMC0*64 +Local LLC false sharing rate PMC0/PMC2 + +LONG +Formulas: +Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM*64 +Local LLC false sharing rate = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL +- +False-sharing of cache lines can dramatically reduce the performance of an +application. This performance group measures the L3 traffic induced by false-sharing. +The false-sharing rate uses all memory loads as reference. 
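The false-sharing volume simply scales the HITM hit count by the cache line size. A minimal Go sketch; numbers are invented:

package main

import "fmt"

func main() {
	hitm := 5.0e5     // MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM (PMC0)
	allLoads := 2.0e9 // MEM_LOAD_UOPS_RETIRED_ALL (PMC2)

	// Each HITM event corresponds to one 64-byte cache line ping-ponging
	// between cores within the local LLC.
	fmt.Printf("Local LLC false sharing [MByte]: %.2f\n", 1.0e-06*hitm*64)
	fmt.Printf("Local LLC false sharing rate: %.2e\n", hitm/allLoads)
}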
diff --git a/collectors/likwid/groups/ivybridge/FLOPS_AVX.txt b/collectors/likwid/groups/ivybridge/FLOPS_AVX.txt new file mode 100644 index 0000000..526d550 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/FLOPS_AVX.txt @@ -0,0 +1,25 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 SIMD_FP_256_PACKED_SINGLE +PMC1 SIMD_FP_256_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime +- +Packed 32b AVX FLOPs rates. Please note that the current FLOP measurements on IvyBridge are +potentially wrong. So you cannot trust these counters at the moment! + diff --git a/collectors/likwid/groups/ivybridge/FLOPS_DP.txt b/collectors/likwid/groups/ivybridge/FLOPS_DP.txt new file mode 100644 index 0000000..e737098 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/FLOPS_DP.txt @@ -0,0 +1,33 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE +PMC2 SIMD_FP_256_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE) +- +SSE scalar and packed double precision FLOP rates. Please note that the current +FLOP measurements on IvyBridge are potentially wrong. +So you cannot trust these counters at the moment! 
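Setting the counter-trust caveat aside, the FLOP rate weights each packed operation by its SIMD width: 2 FLOPs per SSE packed double, 4 per AVX packed double, 1 per scalar. A Go sketch with invented counts:

package main

import "fmt"

func main() {
	packedSSE := 1.0e9 // FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE (PMC0), 2 FLOPs each
	scalar := 2.0e8    // FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE (PMC1), 1 FLOP each
	packedAVX := 5.0e8 // SIMD_FP_256_PACKED_DOUBLE (PMC2), 4 FLOPs each
	seconds := 2.0     // measurement interval ("time")

	mflops := 1.0e-06 * (packedSSE*2.0 + scalar + packedAVX*4.0) / seconds
	// Fraction of FP uOPs that were packed (SSE or AVX) rather than scalar.
	vecRatio := 100 * (packedSSE + packedAVX) / (packedSSE + scalar + packedAVX)

	fmt.Printf("DP [MFLOP/s]: %.1f\n", mflops)
	fmt.Printf("Vectorization ratio: %.1f\n", vecRatio)
}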
+ diff --git a/collectors/likwid/groups/ivybridge/FLOPS_SP.txt b/collectors/likwid/groups/ivybridge/FLOPS_SP.txt new file mode 100644 index 0000000..7483722 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/FLOPS_SP.txt @@ -0,0 +1,33 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE +PMC2 SIMD_FP_256_PACKED_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime +Vectorization ratio = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE) +- +SSE scalar and packed single precision FLOP rates. Please note that the current +FLOP measurements on IvyBridge are potentially wrong. +So you cannot trust these counters at the moment! + diff --git a/collectors/likwid/groups/ivybridge/ICACHE.txt b/collectors/likwid/groups/ivybridge/ICACHE.txt new file mode 100644 index 0000000..f1e2335 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/ICACHE.txt @@ -0,0 +1,33 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ICACHE_ACCESSES +PMC1 ICACHE_MISSES +PMC2 ICACHE_IFETCH_STALL +PMC3 ILD_STALL_IQ_FULL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 +L1I stalls PMC2 +L1I stall rate PMC2/FIXC0 +L1I queue full stalls PMC3 +L1I queue full stall rate PMC3/FIXC0 + +LONG +Formulas: +L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY +L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES +L1I stalls = ICACHE_IFETCH_STALL +L1I stall rate = ICACHE_IFETCH_STALL / INSTR_RETIRED_ANY +- +This group measures some L1 instruction cache metrics. 
diff --git a/collectors/likwid/groups/ivybridge/L2.txt b/collectors/likwid/groups/ivybridge/L2.txt new file mode 100644 index 0000000..376e974 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/L2.txt @@ -0,0 +1,38 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L1D_M_EVICT +PMC2 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the +number of cache lines allocated in the L1 and the number of modified cache lines +evicted from the L1. The group also outputs total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and cache lines transferred into the instruction +cache. + diff --git a/collectors/likwid/groups/ivybridge/L2CACHE.txt b/collectors/likwid/groups/ivybridge/L2CACHE.txt new file mode 100644 index 0000000..9b5dd4b --- /dev/null +++ b/collectors/likwid/groups/ivybridge/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_TRANS_ALL_REQUESTS +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+ + diff --git a/collectors/likwid/groups/ivybridge/L3.txt b/collectors/likwid/groups/ivybridge/L3.txt new file mode 100644 index 0000000..f0a8aad --- /dev/null +++ b/collectors/likwid/groups/ivybridge/L3.txt @@ -0,0 +1,36 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ALL +PMC1 L2_LINES_OUT_DIRTY_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DIRTY_ALL*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DIRTY_ALL*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the +number of cache lines allocated in the L2 and the number of modified cache lines +evicted from the L2. This group also outputs the data volume transferred between the +L3 and the measured cores' L2 caches. Note that this bandwidth also includes data +transfers due to a write allocate load on a store miss in L2. + diff --git a/collectors/likwid/groups/ivybridge/L3CACHE.txt b/collectors/likwid/groups/ivybridge/L3CACHE.txt new file mode 100644 index 0000000..9f3036f --- /dev/null +++ b/collectors/likwid/groups/ivybridge/L3CACHE.txt @@ -0,0 +1,36 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_RETIRED_L3_ALL +PMC1 MEM_LOAD_UOPS_RETIRED_L3_MISS +PMC2 UOPS_RETIRED_ALL + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate (PMC0)/PMC2 +L3 miss rate PMC1/PMC2 +L3 miss ratio PMC1/PMC0 + +LONG +Formulas: +L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL +L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL +L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL +- +This group measures the locality of your data accesses with regard to the +L3 cache. L3 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L3 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+ + diff --git a/collectors/likwid/groups/ivybridge/PORT_USAGE.txt b/collectors/likwid/groups/ivybridge/PORT_USAGE.txt new file mode 100644 index 0000000..d509607 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/PORT_USAGE.txt @@ -0,0 +1,40 @@ +SHORT Execution port utilization + +REQUIRE_NOHT + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_DISPATCHED_PORT_PORT_0 +PMC1 UOPS_DISPATCHED_PORT_PORT_1 +PMC2 UOPS_DISPATCHED_PORT_PORT_2 +PMC3 UOPS_DISPATCHED_PORT_PORT_3 +PMC4 UOPS_DISPATCHED_PORT_PORT_4 +PMC5 UOPS_DISPATCHED_PORT_PORT_5 + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Port0 usage ratio PMC0/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port1 usage ratio PMC1/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port2 usage ratio PMC2/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port3 usage ratio PMC3/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port4 usage ratio PMC4/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port5 usage ratio PMC5/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) + +LONG +Formulas: +Port0 usage ratio = UOPS_DISPATCHED_PORT_PORT_0/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port1 usage ratio = UOPS_DISPATCHED_PORT_PORT_1/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port2 usage ratio = UOPS_DISPATCHED_PORT_PORT_2/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port3 usage ratio = UOPS_DISPATCHED_PORT_PORT_3/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port4 usage ratio = UOPS_DISPATCHED_PORT_PORT_4/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port5 usage ratio = UOPS_DISPATCHED_PORT_PORT_5/SUM(UOPS_DISPATCHED_PORT_PORT_*) +- +This group measures the execution port utilization in a CPU core. The group can +only be measured when HyperThreading is disabled because only then each CPU core +can program eight counters. diff --git a/collectors/likwid/groups/ivybridge/RECOVERY.txt b/collectors/likwid/groups/ivybridge/RECOVERY.txt new file mode 100644 index 0000000..7928ee4 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/RECOVERY.txt @@ -0,0 +1,22 @@ +SHORT Recovery duration + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INT_MISC_RECOVERY_CYCLES +PMC1 INT_MISC_RECOVERY_COUNT + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Avg. recovery duration PMC0/PMC1 + +LONG +Formulas: +Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT +- +This group measures the duration of recoveries after SSE exception, memory +disambiguation, etc... 
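For the PORT_USAGE group above, each ratio is one port's dispatch count over the sum across all six ports. A small Go sketch with made-up values:

package main

import "fmt"

func main() {
	// UOPS_DISPATCHED_PORT_PORT_0 ... PORT_5 (PMC0..PMC5), invented values.
	ports := []float64{3.1e8, 2.9e8, 2.2e8, 2.1e8, 1.5e8, 2.5e8}

	var total float64
	for _, p := range ports {
		total += p
	}
	for i, p := range ports {
		fmt.Printf("Port%d usage ratio: %.3f\n", i, p/total)
	}
}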
diff --git a/collectors/likwid/groups/ivybridge/TLB_DATA.txt b/collectors/likwid/groups/ivybridge/TLB_DATA.txt new file mode 100644 index 0000000..8d94e05 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/TLB_DATA.txt @@ -0,0 +1,35 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK +PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK +PMC2 DTLB_LOAD_MISSES_WALK_DURATION +PMC3 DTLB_STORE_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB load misses PMC0 +L1 DTLB load miss rate PMC0/FIXC0 +L1 DTLB load miss duration [Cyc] PMC2/PMC0 +L1 DTLB store misses PMC1 +L1 DTLB store miss rate PMC1/FIXC0 +L1 DTLB store miss duration [Cyc] PMC3/PMC1 + +LONG +Formulas: +L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK +L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK +- +The DTLB load and store miss rates gives a measure how often a TLB miss occurred +per instruction. The duration measures the time in cycles how long a walk did take. + diff --git a/collectors/likwid/groups/ivybridge/TLB_INSTR.txt b/collectors/likwid/groups/ivybridge/TLB_INSTR.txt new file mode 100644 index 0000000..235d977 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/TLB_INSTR.txt @@ -0,0 +1,28 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ITLB_MISSES_CAUSES_A_WALK +PMC1 ITLB_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + + +LONG +Formulas: +L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK +L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK +- +The ITLB miss rates gives a measure how often a TLB miss occurred +per instruction. The duration measures the time in cycles how long a walk did take. 
+ diff --git a/collectors/likwid/groups/ivybridge/TMA.txt b/collectors/likwid/groups/ivybridge/TMA.txt new file mode 100644 index 0000000..afb4126 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/TMA.txt @@ -0,0 +1,48 @@ +SHORT Top down cycle allocation + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_RETIRED_RETIRE_SLOTS +PMC2 IDQ_UOPS_NOT_DELIVERED_CORE +PMC3 INT_MISC_RECOVERY_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +IPC FIXC0/FIXC1 +Total Slots 4*FIXC1 +Slots Retired PMC1 +Fetch Bubbles PMC2 +Recovery Bubbles 4*PMC3 +Front End [%] PMC2/(4*FIXC1)*100 +Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100 +Retiring [%] PMC1/(4*FIXC1)*100 +Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100 + +LONG +Formulas: +Total Slots = 4*CPU_CLK_UNHALTED_CORE +Slots Retired = UOPS_RETIRED_RETIRE_SLOTS +Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE +Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES +Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100 +Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100 +Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100 +Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100 +-- +This performance group measures cycles to determine percentage of time spent in +front end, back end, retiring and speculation. These metrics are published and +verified by Intel. Further information: +Webpage describing Top-Down Method and its usage in Intel vTune: +https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method +Paper by Yasin Ahmad: +https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0 +Slides by Yasin Ahmad: +http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf +The performance group was originally published here: +http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/ diff --git a/collectors/likwid/groups/ivybridge/UOPS.txt b/collectors/likwid/groups/ivybridge/UOPS.txt new file mode 100644 index 0000000..e6cc208 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/UOPS.txt @@ -0,0 +1,35 @@ +SHORT UOPs execution info + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_EXECUTED_THREAD +PMC2 UOPS_RETIRED_ALL +PMC3 UOPS_ISSUED_FLAGS_MERGE + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Issued UOPs PMC0 +Merged UOPs PMC3 +Executed UOPs PMC1 +Retired UOPs PMC2 + +LONG +Formulas: +Issued UOPs = UOPS_ISSUED_ANY +Merged UOPs = UOPS_ISSUED_FLAGS_MERGE +Executed UOPs = UOPS_EXECUTED_THREAD +Retired UOPs = UOPS_RETIRED_ALL +- +This group returns information about the instruction pipeline. It measures the +issued, executed and retired uOPs and returns the number of uOPs which were issued +but not executed as well as the number of uOPs which were executed but never retired. +The executed but not retired uOPs commonly come from speculatively executed branches. 
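The interesting quantities in the UOPS group are the differences between the pipeline stages. A minimal Go sketch; the counts are invented:

package main

import "fmt"

func main() {
	issued := 2.10e9   // UOPS_ISSUED_ANY (PMC0)
	executed := 2.05e9 // UOPS_EXECUTED_THREAD (PMC1)
	retired := 1.90e9  // UOPS_RETIRED_ALL (PMC2)

	// uOPs that were issued but never executed.
	fmt.Printf("Issued but not executed uOPs: %.2e\n", issued-executed)
	// uOPs that were executed but never retired, typically the work thrown
	// away after mispredicted branches.
	fmt.Printf("Executed but not retired uOPs: %.2e\n", executed-retired)
}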
+ diff --git a/collectors/likwid/groups/ivybridge/UOPS_EXEC.txt b/collectors/likwid/groups/ivybridge/UOPS_EXEC.txt new file mode 100644 index 0000000..7042df7 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/UOPS_EXEC.txt @@ -0,0 +1,31 @@ +SHORT UOPs execution + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_USED_CYCLES +PMC1 UOPS_EXECUTED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the execution stage in the pipeline. Used cycles are all cycles where uOPs are +executed while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles. diff --git a/collectors/likwid/groups/ivybridge/UOPS_ISSUE.txt b/collectors/likwid/groups/ivybridge/UOPS_ISSUE.txt new file mode 100644 index 0000000..9aac923 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/UOPS_ISSUE.txt @@ -0,0 +1,31 @@ +SHORT UOPs issuing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_USED_CYCLES +PMC1 UOPS_ISSUED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the issue stage in the pipeline. Used cycles are all cycles where uOPs are +issued while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles.
diff --git a/collectors/likwid/groups/ivybridge/UOPS_RETIRE.txt b/collectors/likwid/groups/ivybridge/UOPS_RETIRE.txt new file mode 100644 index 0000000..0f37585 --- /dev/null +++ b/collectors/likwid/groups/ivybridge/UOPS_RETIRE.txt @@ -0,0 +1,31 @@ +SHORT UOPs retirement + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_RETIRED_USED_CYCLES +PMC1 UOPS_RETIRED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the retirement stage in the pipeline (re-order buffer). Used cycles are all +cycles where uOPs are retired while unused cycles refer to pipeline stalls. +Moreover, the group calculates the average stall duration in cycles. diff --git a/collectors/likwid/groups/ivybridgeEP/BRANCH.txt b/collectors/likwid/groups/ivybridgeEP/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often, on average, a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio directly states +which fraction of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate.
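BRANCH.txt distinguishes rates (normalized to all retired instructions) from the ratio (normalized to branches only). A quick worked example in Go with invented counts:

package main

import "fmt"

func main() {
	instr := 10_000_000.0   // INSTR_RETIRED_ANY (invented)
	branches := 2_000_000.0 // BR_INST_RETIRED_ALL_BRANCHES
	mispred := 40_000.0     // BR_MISP_RETIRED_ALL_BRANCHES
	fmt.Printf("branch rate:          %.3f\n", branches/instr)   // per instruction
	fmt.Printf("misprediction rate:   %.5f\n", mispred/instr)    // per instruction
	fmt.Printf("misprediction ratio:  %.3f\n", mispred/branches) // per branch
	fmt.Printf("instr. per branch:    %.1f\n", instr/branches)
}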
+ diff --git a/collectors/likwid/groups/ivybridgeEP/CACHES.txt b/collectors/likwid/groups/ivybridgeEP/CACHES.txt new file mode 100644 index 0000000..fd1d43f --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/CACHES.txt @@ -0,0 +1,121 @@ +SHORT Cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L1D_M_EVICT +PMC2 L2_LINES_IN_ALL +PMC3 L2_LINES_OUT_DIRTY_ALL +CBOX0C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX1C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX2C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX3C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX4C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX5C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX6C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX7C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX8C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX9C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX10C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX11C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX12C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX13C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX14C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX0C1 LLC_VICTIMS_M_STATE +CBOX1C1 LLC_VICTIMS_M_STATE +CBOX2C1 LLC_VICTIMS_M_STATE +CBOX3C1 LLC_VICTIMS_M_STATE +CBOX4C1 LLC_VICTIMS_M_STATE +CBOX5C1 LLC_VICTIMS_M_STATE +CBOX6C1 LLC_VICTIMS_M_STATE +CBOX7C1 LLC_VICTIMS_M_STATE +CBOX8C1 LLC_VICTIMS_M_STATE +CBOX9C1 LLC_VICTIMS_M_STATE +CBOX10C1 LLC_VICTIMS_M_STATE +CBOX11C1 LLC_VICTIMS_M_STATE +CBOX12C1 LLC_VICTIMS_M_STATE +CBOX13C1 LLC_VICTIMS_M_STATE +CBOX14C1 LLC_VICTIMS_M_STATE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 +L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time +L3 to L2 load data volume [GBytes] 1.0E-09*PMC2*64.0 +L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time +L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 +System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F+CBOX8C0:STATE=0x3F+CBOX9C0:STATE=0x3F+CBOX10C0:STATE=0x3F+CBOX11C0:STATE=0x3F+CBOX12C0:STATE=0x3F+CBOX13C0:STATE=0x3F+CBOX14C0:STATE=0x3F)*64.0/time +System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F+CBOX8C0:STATE=0x3F+CBOX9C0:STATE=0x3F+ CBOX10C0:STATE=0x3F+CBOX11C0:STATE=0x3F+CBOX12C0:STATE=0x3F+CBOX13C0:STATE=0x3F+CBOX14C0:STATE=0x3F)*64.0 +L3 to memory bandwidth [MBytes/s] 
1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1)*64/time +L3 to memory data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1)*64 +L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F+CBOX8C0:STATE=0x3F+CBOX9C0:STATE=0x3F+ CBOX10C0:STATE=0x3F+CBOX11C0:STATE=0x3F+CBOX12C0:STATE=0x3F+CBOX13C0:STATE=0x3F+CBOX14C0:STATE=0x3F+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1)*64.0/time +L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F+CBOX8C0:STATE=0x3F+CBOX9C0:STATE=0x3F+ CBOX10C0:STATE=0x3F+CBOX11C0:STATE=0x3F+CBOX12C0:STATE=0x3F+CBOX13C0:STATE=0x3F+CBOX14C0:STATE=0x3F+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1)*64.0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 + +LONG +Formulas: +L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time +L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64 +L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time +L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64 +L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time +L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64 +L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time +L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64 +L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time +L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64 +L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F))*64/time +System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F))*64 +L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M_STATE))*64/time +L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M_STATE))*64 +L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F)+SUM(LLC_VICTIMS_M_STATE))*64/time +L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F)+SUM(LLC_VICTIMS_M_STATE))*64 +Memory read 
bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 +- +Group to measure cache transfers between L1 and memory. Please note that the +L3 to/from system metrics contain any traffic to the system (memory, +Intel QPI, etc.) but do not seem to capture all of it: commonly, the memory read +bandwidth and the L3 to L2 bandwidth are higher than the system to L3 bandwidth. + diff --git a/collectors/likwid/groups/ivybridgeEP/CBOX.txt b/collectors/likwid/groups/ivybridgeEP/CBOX.txt new file mode 100644 index 0000000..5c87149 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/CBOX.txt @@ -0,0 +1,55 @@ +SHORT CBOX related data and metrics + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +CBOX0C0 LLC_VICTIMS_M_STATE +CBOX1C0 LLC_VICTIMS_M_STATE +CBOX2C0 LLC_VICTIMS_M_STATE +CBOX3C0 LLC_VICTIMS_M_STATE +CBOX4C0 LLC_VICTIMS_M_STATE +CBOX5C0 LLC_VICTIMS_M_STATE +CBOX6C0 LLC_VICTIMS_M_STATE +CBOX7C0 LLC_VICTIMS_M_STATE +CBOX8C0 LLC_VICTIMS_M_STATE +CBOX9C0 LLC_VICTIMS_M_STATE +CBOX10C0 LLC_VICTIMS_M_STATE +CBOX11C0 LLC_VICTIMS_M_STATE +CBOX12C0 LLC_VICTIMS_M_STATE +CBOX13C0 LLC_VICTIMS_M_STATE +CBOX14C0 LLC_VICTIMS_M_STATE +CBOX0C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX1C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX2C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX3C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX4C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX5C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX6C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX7C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX8C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX9C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX10C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX11C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX12C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX13C1:STATE=0x1 LLC_LOOKUP_ANY +CBOX14C1:STATE=0x1 LLC_LOOKUP_ANY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +LLC misses per instruction (CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0)/FIXC0 +LLC data written to MEM [MBytes] 1E-6*(CBOX0C1:STATE=0x1+CBOX1C1:STATE=0x1+CBOX2C1:STATE=0x1+CBOX3C1:STATE=0x1+CBOX4C1:STATE=0x1+CBOX5C1:STATE=0x1+CBOX6C1:STATE=0x1+CBOX7C1:STATE=0x1+CBOX8C1:STATE=0x1+CBOX9C1:STATE=0x1+CBOX10C1:STATE=0x1+CBOX11C1:STATE=0x1+CBOX12C1:STATE=0x1+CBOX13C1:STATE=0x1+CBOX14C1:STATE=0x1)*64 + + +LONG +Formulas: +LLC misses per instruction = sum(LLC_VICTIMS_M_STATE)/INSTR_RETIRED_ANY +LLC data written to MEM [MBytes] = sum(LLC_LOOKUP_ANY:STATE=0x1)*64*1E-6 +-- +The CBOXes mediate the traffic from the L2 cache to the segmented L3 cache. Each +CBOX is responsible for one segment (2.5 MByte). The boxes maintain the coherence between all +CPU cores of the socket. Depending on the CPU core count, some CBOXes are not attached +to a 2.5 MByte slice but are still active and track the traffic.
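All bandwidth metrics in CACHES.txt share one pattern: an event that counts 64-byte cache lines, scaled by the line size and divided by the runtime. A sketch of that arithmetic in Go, with a made-up L1D_REPLACEMENT count:

package main

import "fmt"

// lineBW converts a cache-line count into MBytes/s and GBytes, the scaling
// used by every bandwidth/volume metric pair in these group files.
func lineBW(lines, seconds float64) (mbytesPerSec, gbytes float64) {
	bytes := lines * 64.0 // one cache line is 64 bytes
	return 1.0e-6 * bytes / seconds, 1.0e-9 * bytes
}

func main() {
	l1dReplacement := 5_000_000.0 // L1D_REPLACEMENT over 1 s (invented)
	bw, vol := lineBW(l1dReplacement, 1.0)
	fmt.Printf("L2 to L1 load: %.1f MBytes/s, %.3f GBytes\n", bw, vol)
}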
diff --git a/collectors/likwid/groups/ivybridgeEP/CLOCK.txt b/collectors/likwid/groups/ivybridgeEP/CLOCK.txt new file mode 100644 index 0000000..fb19101 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/CLOCK.txt @@ -0,0 +1,26 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +UBOXFIX UNCORE_CLOCK + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*UBOXFIX/time +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time +- +IvyBridge implements the new RAPL interface. This interface enables monitoring of +the consumed energy on the package (socket) level. + diff --git a/collectors/likwid/groups/ivybridgeEP/CYCLE_ACTIVITY.txt b/collectors/likwid/groups/ivybridgeEP/CYCLE_ACTIVITY.txt new file mode 100644 index 0000000..c432a44 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/CYCLE_ACTIVITY.txt @@ -0,0 +1,38 @@ +SHORT Cycle Activities + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING +PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING +PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING +PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Cycles without execution [%] (PMC3/FIXC1)*100 +Cycles without execution due to L1D [%] (PMC2/FIXC1)*100 +Cycles without execution due to L2 [%] (PMC0/FIXC1)*100 +Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100 +Cycles with stalls due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles with stalls due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100 +-- +This performance group measures the cycles while waiting for data from the cache +and memory hierarchy. +CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts the number of cycles in which nothing is executed on +any execution port. +CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while memory subsystem has an +outstanding load.
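CLOCK.txt (and the ENERGY group below) derives power from RAPL energy counts: RAPL reports Joules accumulated over the measurement interval, not Watts, so the metric divides by the runtime. A small Go sketch with illustrative values:

package main

import "fmt"

func main() {
	pkgEnergy := 95.0    // PWR_PKG_ENERGY in Joules over the interval (invented)
	uncoreTicks := 2.7e9 // UNCORE_CLOCK (invented)
	seconds := 1.0       // measurement interval
	fmt.Printf("Power [W]:          %.1f\n", pkgEnergy/seconds)
	fmt.Printf("Uncore clock [MHz]: %.0f\n", 1.0e-6*uncoreTicks/seconds)
}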
diff --git a/collectors/likwid/groups/ivybridgeEP/CYCLE_STALLS.txt b/collectors/likwid/groups/ivybridgeEP/CYCLE_STALLS.txt new file mode 100644 index 0000000..795aeb9 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/CYCLE_STALLS.txt @@ -0,0 +1,45 @@ +SHORT Cycle Activities (Stalls) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING +PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING +PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING +PMC3 CYCLE_ACTIVITY_STALLS_TOTAL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Total execution stalls PMC3 +Stalls caused by L1D misses [%] (PMC2/PMC3)*100 +Stalls caused by L2 misses [%] (PMC0/PMC3)*100 +Stalls caused by memory loads [%] (PMC1/PMC3)*100 +Execution stall rate [%] (PMC3/FIXC1)*100 +Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100 +Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100 +Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL +Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100 +-- +This performance group measures the stalls caused by data traffic in the cache +hierarchy. +CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls. +CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while memory subsystem has +an outstanding load. diff --git a/collectors/likwid/groups/ivybridgeEP/DATA.txt b/collectors/likwid/groups/ivybridgeEP/DATA.txt new file mode 100644 index 0000000..967cbad --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_UOPS_RETIRED_LOADS +PMC1 MEM_UOPS_RETIRED_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES +- +This is a metric to determine your load to store ratio. 
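CYCLE_STALLS.txt reports every stall reason under two normalizations: as a share of all execution stalls ("caused by ... [%]") and as a share of all cycles ("... rate [%]"). The distinction in code, with invented counts:

package main

import "fmt"

func main() {
	cycles := 1.0e9      // CPU_CLK_UNHALTED_CORE (invented)
	stallsTotal := 4.0e8 // CYCLE_ACTIVITY_STALLS_TOTAL
	stallsL2 := 1.5e8    // CYCLE_ACTIVITY_STALLS_L2_PENDING
	// Same numerator, two different denominators.
	fmt.Printf("L2 share of stalls: %.1f%%\n", stallsL2/stallsTotal*100)
	fmt.Printf("L2 stall rate:      %.1f%%\n", stallsL2/cycles*100)
}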
+ diff --git a/collectors/likwid/groups/ivybridgeEP/DIVIDE.txt b/collectors/likwid/groups/ivybridgeEP/DIVIDE.txt new file mode 100644 index 0000000..f8cb0b3 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ARITH_NUM_DIV +PMC1 ARITH_FPU_DIV_ACTIVE + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_NUM_DIV +Avg. divide unit usage duration = ARITH_FPU_DIV_ACTIVE/ARITH_NUM_DIV +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/ivybridgeEP/ENERGY.txt b/collectors/likwid/groups/ivybridgeEP/ENERGY.txt new file mode 100644 index 0000000..74c16bb --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/ENERGY.txt @@ -0,0 +1,33 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR3 PWR_DRAM_ENERGY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +IvyBridge implements the new RAPL interface. This interface enables monitoring of +the consumed energy on the package (socket), the PP0 domain +and the DRAM level. The PP0 domain often refers to only the CPU cores. diff --git a/collectors/likwid/groups/ivybridgeEP/FALSE_SHARE.txt b/collectors/likwid/groups/ivybridgeEP/FALSE_SHARE.txt new file mode 100644 index 0000000..5e28a15 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/FALSE_SHARE.txt @@ -0,0 +1,32 @@ +SHORT False sharing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM +PMC1 MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_HITM +PMC2 MEM_LOAD_UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local LLC false sharing [MByte] 1.E-06*PMC0*64 +Local LLC false sharing rate PMC0/PMC2 +Remote LLC false sharing [MByte] 1.E-06*PMC1*64 +Remote LLC false sharing rate PMC1/PMC2 + +LONG +Formulas: +Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM*64 +Local LLC false sharing rate = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL +Remote LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_HITM*64 +Remote LLC false sharing rate = MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_HITM/MEM_LOAD_UOPS_RETIRED_ALL +- +False-sharing of cache lines can dramatically reduce the performance of an +application. This performance group measures the L3 traffic induced by false-sharing. +The false-sharing rate uses all memory loads as reference. +For systems with multiple CPU sockets, this performance group also measures the +false-sharing of cache lines over socket boundaries.
diff --git a/collectors/likwid/groups/ivybridgeEP/FLOPS_AVX.txt b/collectors/likwid/groups/ivybridgeEP/FLOPS_AVX.txt new file mode 100644 index 0000000..0ad669f --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/FLOPS_AVX.txt @@ -0,0 +1,26 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 SIMD_FP_256_PACKED_SINGLE +PMC1 SIMD_FP_256_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime +- +Packed 32b AVX FLOPs rates. Please note that the current FLOP measurements on +IvyBridge are potentially wrong. +So you cannot trust these counters at the moment! + diff --git a/collectors/likwid/groups/ivybridgeEP/FLOPS_DP.txt b/collectors/likwid/groups/ivybridgeEP/FLOPS_DP.txt new file mode 100644 index 0000000..e737098 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/FLOPS_DP.txt @@ -0,0 +1,33 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE +PMC2 SIMD_FP_256_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE) +- +SSE scalar and packed double precision FLOP rates. Please note that the current +FLOP measurements on IvyBridge are potentially wrong. +So you cannot trust these counters at the moment! 
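FLOPS_DP.txt weights each uop kind by the FLOPs it carries: an SSE packed double uop does 2 FLOPs, a scalar uop 1, and a 256-bit AVX packed double uop 4 (the single-precision group is analogous with weights 4 and 8). A worked example in Go with invented counter values:

package main

import "fmt"

func main() {
	ssePacked := 1.0e8 // FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE (invented)
	sseScalar := 2.0e8 // FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
	avxPacked := 3.0e8 // SIMD_FP_256_PACKED_DOUBLE
	seconds := 1.0
	mflops := 1.0e-6 * (ssePacked*2 + sseScalar + avxPacked*4) / seconds
	vec := 100 * (ssePacked + avxPacked) / (ssePacked + sseScalar + avxPacked)
	fmt.Printf("DP: %.0f MFLOP/s, vectorization ratio %.1f%%\n", mflops, vec)
}

Note the caveat in the group files still applies: these FLOP counters are flagged as potentially wrong on IvyBridge, so the arithmetic is only as good as its inputs.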
+ diff --git a/collectors/likwid/groups/ivybridgeEP/FLOPS_SP.txt b/collectors/likwid/groups/ivybridgeEP/FLOPS_SP.txt new file mode 100644 index 0000000..7483722 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/FLOPS_SP.txt @@ -0,0 +1,33 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE +PMC2 SIMD_FP_256_PACKED_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime +Vectorization ratio = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE) +- +SSE scalar and packed single precision FLOP rates. Please note that the current +FLOP measurements on IvyBridge are potentially wrong. +So you cannot trust these counters at the moment! + diff --git a/collectors/likwid/groups/ivybridgeEP/ICACHE.txt b/collectors/likwid/groups/ivybridgeEP/ICACHE.txt new file mode 100644 index 0000000..f1e2335 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/ICACHE.txt @@ -0,0 +1,33 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ICACHE_ACCESSES +PMC1 ICACHE_MISSES +PMC2 ICACHE_IFETCH_STALL +PMC3 ILD_STALL_IQ_FULL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 +L1I stalls PMC2 +L1I stall rate PMC2/FIXC0 +L1I queue full stalls PMC3 +L1I queue full stall rate PMC3/FIXC0 + +LONG +Formulas: +L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY +L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES +L1I stalls = ICACHE_IFETCH_STALL +L1I stall rate = ICACHE_IFETCH_STALL / INSTR_RETIRED_ANY +- +This group measures some L1 instruction cache metrics. 
diff --git a/collectors/likwid/groups/ivybridgeEP/L2.txt b/collectors/likwid/groups/ivybridgeEP/L2.txt new file mode 100644 index 0000000..376e974 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/L2.txt @@ -0,0 +1,38 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L1D_M_EVICT +PMC2 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the +number of cache lines allocated in the L1 and the number of modified cache lines +evicted from the L1. The group also outputs the total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and cache lines transferred into the instruction +cache. + diff --git a/collectors/likwid/groups/ivybridgeEP/L2CACHE.txt b/collectors/likwid/groups/ivybridgeEP/L2CACHE.txt new file mode 100644 index 0000000..9b5dd4b --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_TRANS_ALL_REQUESTS +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally, the L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+ + diff --git a/collectors/likwid/groups/ivybridgeEP/L3.txt b/collectors/likwid/groups/ivybridgeEP/L3.txt new file mode 100644 index 0000000..f0a8aad --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/L3.txt @@ -0,0 +1,36 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ALL +PMC1 L2_LINES_OUT_DIRTY_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DIRTY_ALL*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DIRTY_ALL*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed from the +number of cache lines allocated in the L2 and the number of modified cache lines +evicted from the L2. This group also outputs the data volume transferred between the +L3 and the measured cores' L2 caches. Note that this bandwidth also includes data +transfers due to a write allocate load on a store miss in L2. + diff --git a/collectors/likwid/groups/ivybridgeEP/L3CACHE.txt b/collectors/likwid/groups/ivybridgeEP/L3CACHE.txt new file mode 100644 index 0000000..9f3036f --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/L3CACHE.txt @@ -0,0 +1,36 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_RETIRED_L3_ALL +PMC1 MEM_LOAD_UOPS_RETIRED_L3_MISS +PMC2 UOPS_RETIRED_ALL + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate (PMC0)/PMC2 +L3 miss rate PMC1/PMC2 +L3 miss ratio PMC1/PMC0 + +LONG +Formulas: +L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL +L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL +L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL +- +This group measures the locality of your data accesses with regard to the +L3 cache. L3 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L3 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally, the L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+ + diff --git a/collectors/likwid/groups/ivybridgeEP/MEM.txt b/collectors/likwid/groups/ivybridgeEP/MEM.txt new file mode 100644 index 0000000..fd80c2c --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/MEM.txt @@ -0,0 +1,49 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on a +per socket base. Some of the counters may not be available on your system. +Also outputs total data volume transferred from main memory. 
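MEM.txt sums CAS_COUNT_RD/CAS_COUNT_WR over up to eight memory channels, one 64-byte line per CAS. A sketch of the summation in Go, with a fabricated per-channel readout standing in for the real MBOX counters:

package main

import "fmt"

func sum(xs []float64) (s float64) {
	for _, x := range xs {
		s += x
	}
	return
}

func main() {
	// Invented per-channel CAS counts over 1 s; MBOXxC0 reads, MBOXxC1 writes.
	casRead := []float64{3.0e7, 3.0e7, 2.9e7, 3.1e7, 0, 0, 0, 0}
	casWrite := []float64{1.0e7, 1.0e7, 0.9e7, 1.1e7, 0, 0, 0, 0}
	seconds := 1.0
	rd := 1.0e-6 * sum(casRead) * 64 / seconds
	wr := 1.0e-6 * sum(casWrite) * 64 / seconds
	fmt.Printf("read %.0f, write %.0f, total %.0f MBytes/s\n", rd, wr, rd+wr)
}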
+ diff --git a/collectors/likwid/groups/ivybridgeEP/MEM_DP.txt b/collectors/likwid/groups/ivybridgeEP/MEM_DP.txt new file mode 100644 index 0000000..eff1677 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/MEM_DP.txt @@ -0,0 +1,75 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE +PMC2 SIMD_FP_256_PACKED_DOUBLE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +MFLOP/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time +AVX [MFLOP/s] 1.0E-06*(PMC2*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Operational intensity (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime +AVX [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +Operational intensity = (FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) +-- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per socket base. Also outputs total data volume transferred from main memory.
+SSE scalar and packed double precision FLOP rates. Also reports on packed AVX +32b instructions. Please note that the current FLOP measurements on IvyBridge +are potentially wrong. So you cannot trust these counters at the moment! +The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. + diff --git a/collectors/likwid/groups/ivybridgeEP/MEM_SP.txt b/collectors/likwid/groups/ivybridgeEP/MEM_SP.txt new file mode 100644 index 0000000..e541340 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/MEM_SP.txt @@ -0,0 +1,74 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE +PMC2 SIMD_FP_256_PACKED_SINGLE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +MFLOP/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time +AVX [MFLOP/s] 1.0E-06*(PMC2*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Operational intensity (PMC0*4.0+PMC1+PMC2*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime +AVX [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time +Memory data
volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +Operational intensity = (FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) +-- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per socket base. Also outputs total data volume transferred from main memory. +SSE scalar and packed single precision FLOP rates. Also reports on packed AVX +32b instructions. Please note that the current FLOP measurements on IvyBridge +are potentially wrong. So you cannot trust these counters at the moment! +The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. diff --git a/collectors/likwid/groups/ivybridgeEP/NUMA.txt b/collectors/likwid/groups/ivybridgeEP/NUMA.txt new file mode 100644 index 0000000..41fbe62 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/NUMA.txt @@ -0,0 +1,33 @@ +SHORT Local and remote memory accesses + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 OFFCORE_RESPONSE_0_LOCAL_DRAM +PMC1 OFFCORE_RESPONSE_1_REMOTE_DRAM + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local DRAM data volume [GByte] 1.E-09*PMC0*64 +Local DRAM bandwidth [MByte/s] 1.E-06*(PMC0*64)/time +Remote DRAM data volume [GByte] 1.E-09*PMC1*64 +Remote DRAM bandwidth [MByte/s] 1.E-06*(PMC1*64)/time +Memory data volume [GByte] 1.E-09*(PMC0+PMC1)*64 +Memory bandwidth [MByte/s] 1.E-06*((PMC0+PMC1)*64)/time + +LONG +Formulas: +CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY +Local DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_0_LOCAL_DRAM*64 +Local DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_0_LOCAL_DRAM*64)/time +Remote DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_1_REMOTE_DRAM*64 +Remote DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_1_REMOTE_DRAM*64)/time +Memory data volume [GByte] = 1.E-09*(OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64 +Memory bandwidth [MByte/s] = 1.E-06*((OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64)/time +-- +This performance group measures the data traffic of CPU cores to local and remote +memory.
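The operational intensity in MEM_DP.txt/MEM_SP.txt is the ratio of core FLOPs to socket-wide memory traffic in bytes, which places a code point on a roofline plot. A minimal Go example with invented totals:

package main

import "fmt"

func main() {
	flops := 4.0e9    // weighted FLOP count: packed*2 + scalar + AVX*4 (invented)
	casTotal := 1.0e9 // SUM(CAS_COUNT_RD) + SUM(CAS_COUNT_WR)
	bytes := casTotal * 64 // one 64-byte line per CAS
	fmt.Printf("operational intensity: %.3f FLOP/Byte\n", flops/bytes)
}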
diff --git a/collectors/likwid/groups/ivybridgeEP/PORT_USAGE.txt b/collectors/likwid/groups/ivybridgeEP/PORT_USAGE.txt new file mode 100644 index 0000000..d509607 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/PORT_USAGE.txt @@ -0,0 +1,40 @@ +SHORT Execution port utilization + +REQUIRE_NOHT + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_DISPATCHED_PORT_PORT_0 +PMC1 UOPS_DISPATCHED_PORT_PORT_1 +PMC2 UOPS_DISPATCHED_PORT_PORT_2 +PMC3 UOPS_DISPATCHED_PORT_PORT_3 +PMC4 UOPS_DISPATCHED_PORT_PORT_4 +PMC5 UOPS_DISPATCHED_PORT_PORT_5 + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Port0 usage ratio PMC0/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port1 usage ratio PMC1/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port2 usage ratio PMC2/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port3 usage ratio PMC3/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port4 usage ratio PMC4/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port5 usage ratio PMC5/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) + +LONG +Formulas: +Port0 usage ratio = UOPS_DISPATCHED_PORT_PORT_0/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port1 usage ratio = UOPS_DISPATCHED_PORT_PORT_1/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port2 usage ratio = UOPS_DISPATCHED_PORT_PORT_2/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port3 usage ratio = UOPS_DISPATCHED_PORT_PORT_3/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port4 usage ratio = UOPS_DISPATCHED_PORT_PORT_4/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port5 usage ratio = UOPS_DISPATCHED_PORT_PORT_5/SUM(UOPS_DISPATCHED_PORT_PORT_*) +- +This group measures the execution port utilization in a CPU core. The group can +only be measured when HyperThreading is disabled because only then each CPU core +can program eight counters. 
diff --git a/collectors/likwid/groups/ivybridgeEP/QPI.txt b/collectors/likwid/groups/ivybridgeEP/QPI.txt new file mode 100644 index 0000000..a2f1339 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/QPI.txt @@ -0,0 +1,52 @@ +SHORT QPI Link Layer data + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +SBOX0C0 DIRECT2CORE_SUCCESS_RBT_HIT +SBOX1C0 DIRECT2CORE_SUCCESS_RBT_HIT +SBOX2C0 DIRECT2CORE_SUCCESS_RBT_HIT +SBOX0C1 TXL_FLITS_G0_DATA +SBOX1C1 TXL_FLITS_G0_DATA +SBOX2C1 TXL_FLITS_G0_DATA +SBOX0C2 TXL_FLITS_G0_NON_DATA +SBOX1C2 TXL_FLITS_G0_NON_DATA +SBOX2C2 TXL_FLITS_G0_NON_DATA +SBOX0C3 SBOX_CLOCKTICKS +SBOX1C3 SBOX_CLOCKTICKS +SBOX2C3 SBOX_CLOCKTICKS +SBOX0FIX QPI_RATE +SBOX1FIX QPI_RATE +SBOX2FIX QPI_RATE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +QPI Speed Link 0 [GT/s] ((SBOX0C3)/time)*inverseClock*(8/1000) +QPI Speed Link 1 [GT/s] ((SBOX1C3)/time)*inverseClock*(8/1000) +QPI Speed Link 2 [GT/s] ((SBOX2C3)/time)*inverseClock*(8/1000) +QPI Rate Link 0 [GT/s] 1.E-09*SBOX0FIX +QPI Rate Link 1 [GT/s] 1.E-09*SBOX1FIX +QPI Rate Link 2 [GT/s] 1.E-09*SBOX2FIX +data from QPI to LLC [MByte] 1.E-06*(SBOX0C0+SBOX1C0+SBOX2C0)*8 +QPI data volume [MByte] 1.E-06*(SBOX0C1+SBOX1C1+SBOX2C1)*8 +QPI data bandwidth [MByte/s] 1.E-06*(SBOX0C1+SBOX1C1+SBOX2C1)*8/time +QPI link volume [MByte] 1.E-06*(SBOX0C1+SBOX1C1+SBOX2C1+SBOX0C2+SBOX1C2+SBOX2C2)*8 +QPI link bandwidth [MByte/s] 1.E-06*(SBOX0C1+SBOX1C1+SBOX2C1+SBOX0C2+SBOX1C2+SBOX2C2)*8/time + +LONG +Formulas: +QPI Speed Link 0/1/2 [GT/s] = ((SBOX_CLOCKTICKS)/time)*clock*(8/1000) +QPI Rate Link 0/1/2 [GT/s] = 1.E-09*(QPI_RATE) +data from QPI to LLC [MByte] = 1.E-06*(sum(DIRECT2CORE_SUCCESS_RBT_HIT)*64) +QPI data volume [MByte] = 1.E-06*(sum(TXL_FLITS_G0_DATA)*8) +QPI data bandwidth [MByte/s] = 1.E-06*(sum(TXL_FLITS_G0_DATA)*8)/runtime +QPI link volume [MByte] = 1.E-06*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8) +QPI link bandwidth [MByte/s] = 1.E-06*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8)/runtime +-- +The Intel QPI Link Layer is responsible for packetizing requests from the caching agent (CBOXes) +on the way out to the system interface. + diff --git a/collectors/likwid/groups/ivybridgeEP/RECOVERY.txt b/collectors/likwid/groups/ivybridgeEP/RECOVERY.txt new file mode 100644 index 0000000..7928ee4 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/RECOVERY.txt @@ -0,0 +1,22 @@ +SHORT Recovery duration + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INT_MISC_RECOVERY_CYCLES +PMC1 INT_MISC_RECOVERY_COUNT + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Avg. recovery duration PMC0/PMC1 + +LONG +Formulas: +Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT +- +This group measures the duration of recoveries after SSE exception, memory +disambiguation, etc... 
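Unlike the cache groups, QPI.txt accounts traffic in 8-byte flits rather than 64-byte lines, and keeps data and non-data (protocol) flits separate. A sketch with invented per-link counts:

package main

import "fmt"

func main() {
	dataFlits := []float64{4.0e8, 4.0e8, 0}    // TXL_FLITS_G0_DATA per SBOX (invented)
	nonDataFlits := []float64{1.0e8, 1.0e8, 0} // TXL_FLITS_G0_NON_DATA per SBOX
	seconds := 1.0
	var data, link float64
	for i := range dataFlits {
		data += dataFlits[i] * 8 // one G0 flit carries 8 bytes
		link += (dataFlits[i] + nonDataFlits[i]) * 8
	}
	fmt.Printf("QPI data bandwidth: %.0f MByte/s\n", 1.0e-6*data/seconds)
	fmt.Printf("QPI link bandwidth: %.0f MByte/s\n", 1.0e-6*link/seconds)
}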
diff --git a/collectors/likwid/groups/ivybridgeEP/TLB_DATA.txt b/collectors/likwid/groups/ivybridgeEP/TLB_DATA.txt new file mode 100644 index 0000000..8d94e05 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/TLB_DATA.txt @@ -0,0 +1,35 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK +PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK +PMC2 DTLB_LOAD_MISSES_WALK_DURATION +PMC3 DTLB_STORE_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB load misses PMC0 +L1 DTLB load miss rate PMC0/FIXC0 +L1 DTLB load miss duration [Cyc] PMC2/PMC0 +L1 DTLB store misses PMC1 +L1 DTLB store miss rate PMC1/FIXC0 +L1 DTLB store miss duration [Cyc] PMC3/PMC1 + +LONG +Formulas: +L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK +L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK +- +The DTLB load and store miss rates give a measure of how often a TLB miss occurred +per instruction. The duration measures how many cycles a page table walk took. + diff --git a/collectors/likwid/groups/ivybridgeEP/TLB_INSTR.txt b/collectors/likwid/groups/ivybridgeEP/TLB_INSTR.txt new file mode 100644 index 0000000..235d977 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/TLB_INSTR.txt @@ -0,0 +1,28 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ITLB_MISSES_CAUSES_A_WALK +PMC1 ITLB_MISSES_WALK_DURATION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + + +LONG +Formulas: +L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK +L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK +- +The ITLB miss rate gives a measure of how often a TLB miss occurred +per instruction. The duration measures how many cycles a page table walk took.
+ diff --git a/collectors/likwid/groups/ivybridgeEP/TMA.txt b/collectors/likwid/groups/ivybridgeEP/TMA.txt new file mode 100644 index 0000000..afb4126 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/TMA.txt @@ -0,0 +1,48 @@ +SHORT Top down cycle allocation + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_RETIRED_RETIRE_SLOTS +PMC2 IDQ_UOPS_NOT_DELIVERED_CORE +PMC3 INT_MISC_RECOVERY_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +IPC FIXC0/FIXC1 +Total Slots 4*FIXC1 +Slots Retired PMC1 +Fetch Bubbles PMC2 +Recovery Bubbles 4*PMC3 +Front End [%] PMC2/(4*FIXC1)*100 +Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100 +Retiring [%] PMC1/(4*FIXC1)*100 +Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100 + +LONG +Formulas: +Total Slots = 4*CPU_CLK_UNHALTED_CORE +Slots Retired = UOPS_RETIRED_RETIRE_SLOTS +Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE +Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES +Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100 +Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100 +Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100 +Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100 +-- +This performance group measures cycles to determine percentage of time spent in +front end, back end, retiring and speculation. These metrics are published and +verified by Intel. Further information: +Webpage describing Top-Down Method and its usage in Intel vTune: +https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method +Paper by Yasin Ahmad: +https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0 +Slides by Yasin Ahmad: +http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf +The performance group was originally published here: +http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/ diff --git a/collectors/likwid/groups/ivybridgeEP/UNCORECLOCK.txt b/collectors/likwid/groups/ivybridgeEP/UNCORECLOCK.txt new file mode 100644 index 0000000..1cc1f98 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/UNCORECLOCK.txt @@ -0,0 +1,96 @@ +SHORT All Clocks + +EVENTSET +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +CBOX0C0 CBOX_CLOCKTICKS +CBOX1C0 CBOX_CLOCKTICKS +CBOX2C0 CBOX_CLOCKTICKS +CBOX3C0 CBOX_CLOCKTICKS +CBOX4C0 CBOX_CLOCKTICKS +CBOX5C0 CBOX_CLOCKTICKS +CBOX6C0 CBOX_CLOCKTICKS +CBOX7C0 CBOX_CLOCKTICKS +CBOX8C0 CBOX_CLOCKTICKS +CBOX9C0 CBOX_CLOCKTICKS +CBOX10C0 CBOX_CLOCKTICKS +CBOX11C0 CBOX_CLOCKTICKS +CBOX12C0 CBOX_CLOCKTICKS +CBOX13C0 CBOX_CLOCKTICKS +CBOX14C0 CBOX_CLOCKTICKS +MBOX0C0 DRAM_CLOCKTICKS +MBOX1C0 DRAM_CLOCKTICKS +MBOX2C0 DRAM_CLOCKTICKS +MBOX3C0 DRAM_CLOCKTICKS +MBOX0FIX DRAM_CLOCKTICKS +MBOX1FIX DRAM_CLOCKTICKS +MBOX2FIX DRAM_CLOCKTICKS +MBOX3FIX DRAM_CLOCKTICKS +SBOX0C0 SBOX_CLOCKTICKS +SBOX1C0 SBOX_CLOCKTICKS +SBOX2C0 SBOX_CLOCKTICKS +UBOXFIX UNCORE_CLOCK +BBOX0C0 BBOX_CLOCKTICKS +BBOX1C0 BBOX_CLOCKTICKS +WBOX0 WBOX_CLOCKTICKS +PBOX0 PBOX_CLOCKTICKS +RBOX0C0 RBOX_CLOCKTICKS +RBOX1C0 RBOX_CLOCKTICKS +RBOX2C0 RBOX_CLOCKTICKS +IBOX0C0 IBOX_CLOCKTICKS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock 
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +UBOX Frequency [GHz] 1.E-09*UBOXFIX/(FIXC1*inverseClock) +CBOX0 Frequency [GHz] 1.E-09*CBOX0C0/(FIXC1*inverseClock) +CBOX1 Frequency [GHz] 1.E-09*CBOX1C0/(FIXC1*inverseClock) +CBOX2 Frequency [GHz] 1.E-09*CBOX2C0/(FIXC1*inverseClock) +CBOX3 Frequency [GHz] 1.E-09*CBOX3C0/(FIXC1*inverseClock) +CBOX4 Frequency [GHz] 1.E-09*CBOX4C0/(FIXC1*inverseClock) +CBOX5 Frequency [GHz] 1.E-09*CBOX5C0/(FIXC1*inverseClock) +CBOX6 Frequency [GHz] 1.E-09*CBOX6C0/(FIXC1*inverseClock) +CBOX7 Frequency [GHz] 1.E-09*CBOX7C0/(FIXC1*inverseClock) +CBOX8 Frequency [GHz] 1.E-09*CBOX8C0/(FIXC1*inverseClock) +CBOX9 Frequency [GHz] 1.E-09*CBOX9C0/(FIXC1*inverseClock) +CBOX10 Frequency [GHz] 1.E-09*CBOX10C0/(FIXC1*inverseClock) +CBOX11 Frequency [GHz] 1.E-09*CBOX11C0/(FIXC1*inverseClock) +CBOX12 Frequency [GHz] 1.E-09*CBOX12C0/(FIXC1*inverseClock) +CBOX13 Frequency [GHz] 1.E-09*CBOX13C0/(FIXC1*inverseClock) +CBOX14 Frequency [GHz] 1.E-09*CBOX14C0/(FIXC1*inverseClock) +MBOX0 Frequency [GHz] 1.E-09*MBOX0C0/(FIXC1*inverseClock) +MBOX0FIX Frequency [GHz] 1.E-09*MBOX0FIX/(FIXC1*inverseClock) +MBOX1 Frequency [GHz] 1.E-09*MBOX1C0/(FIXC1*inverseClock) +MBOX1FIX Frequency [GHz] 1.E-09*MBOX1FIX/(FIXC1*inverseClock) +MBOX2 Frequency [GHz] 1.E-09*MBOX2C0/(FIXC1*inverseClock) +MBOX2FIX Frequency [GHz] 1.E-09*MBOX2FIX/(FIXC1*inverseClock) +MBOX3 Frequency [GHz] 1.E-09*MBOX3C0/(FIXC1*inverseClock) +MBOX3FIX Frequency [GHz] 1.E-09*MBOX3FIX/(FIXC1*inverseClock) +SBOX0 Frequency [GHz] 1.E-09*SBOX0C0/(FIXC1*inverseClock) +SBOX1 Frequency [GHz] 1.E-09*SBOX1C0/(FIXC1*inverseClock) +SBOX2 Frequency [GHz] 1.E-09*SBOX2C0/(FIXC1*inverseClock) +BBOX0 Frequency [GHz] 1.E-09*BBOX0C0/(FIXC1*inverseClock) +BBOX1 Frequency [GHz] 1.E-09*BBOX1C0/(FIXC1*inverseClock) +WBOX Frequency [GHz] 1.E-09*WBOX0/(FIXC1*inverseClock) +PBOX Frequency [GHz] 1.E-09*PBOX0/(FIXC1*inverseClock) +RBOX0 Frequency [GHz] 1.E-09*RBOX0C0/(FIXC1*inverseClock) +RBOX1 Frequency [GHz] 1.E-09*RBOX1C0/(FIXC1*inverseClock) +RBOX2 Frequency [GHz] 1.E-09*RBOX2C0/(FIXC1*inverseClock) +IBOX Frequency [GHz] 1.E-09*IBOX0/(FIXC1*inverseClock) + + +LONG +Formulas: +UBOX Frequency [GHz] = 1.E-09*UNCORE_CLOCK/(CPU_CLK_UNHALTED_CORE*inverseClock) +CBOX[0-14] Frequency [GHz] = 1.E-09*CBOX_CLOCKTICKS/(CPU_CLK_UNHALTED_CORE*inverseClock) +MBOX[0-3] Frequency [GHz] = 1.E-09*DRAM_CLOCKTICKS/(CPU_CLK_UNHALTED_CORE*inverseClock) +MBOX[0-3]FIX Frequency [GHz] = 1.E-09*DRAM_CLOCKTICKS/(CPU_CLK_UNHALTED_CORE*inverseClock) +SBOX[0-2] Frequency [GHz] = 1.E-09*SBOX_CLOCKTICKS/(CPU_CLK_UNHALTED_CORE*inverseClock) +BBOX[0-1] Frequency [GHz] = 1.E-09*BBOX_CLOCKTICKS/(CPU_CLK_UNHALTED_CORE*inverseClock) +RBOX[0-2] Frequency [GHz] = 1.E-09*RBOX_CLOCKTICKS/(CPU_CLK_UNHALTED_CORE*inverseClock) +WBOX Frequency [GHz] = 1.E-09*WBOX_CLOCKTICKS/(CPU_CLK_UNHALTED_CORE*inverseClock) +PBOX Frequency [GHz] = 1.E-09*PBOX_CLOCKTICKS/(CPU_CLK_UNHALTED_CORE*inverseClock) +IBOX Frequency [GHz] = 1.E-09*IBOX_CLOCKTICKS/(CPU_CLK_UNHALTED_CORE*inverseClock) +-- +An overview of the frequencies of all Uncore units.
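UNCORECLOCK.txt derives each box frequency by dividing its clocktick count by the elapsed time, which is itself reconstructed as core cycles times inverseClock (assumed here to be the reciprocal of the nominal core frequency). A sketch with invented values:

package main

import "fmt"

func main() {
	coreCycles := 2.2e9       // CPU_CLK_UNHALTED_CORE (invented)
	inverseClock := 1 / 2.2e9 // seconds per nominal core cycle (assumption)
	cboxTicks := 2.7e9        // CBOX_CLOCKTICKS
	seconds := coreCycles * inverseClock // reconstructed runtime
	fmt.Printf("CBOX frequency: %.2f GHz\n", 1.0e-9*cboxTicks/seconds)
}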
diff --git a/collectors/likwid/groups/ivybridgeEP/UOPS.txt b/collectors/likwid/groups/ivybridgeEP/UOPS.txt new file mode 100644 index 0000000..e6cc208 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/UOPS.txt @@ -0,0 +1,35 @@ +SHORT UOPs execution info + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_EXECUTED_THREAD +PMC2 UOPS_RETIRED_ALL +PMC3 UOPS_ISSUED_FLAGS_MERGE + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Issued UOPs PMC0 +Merged UOPs PMC3 +Executed UOPs PMC1 +Retired UOPs PMC2 + +LONG +Formulas: +Issued UOPs = UOPS_ISSUED_ANY +Merged UOPs = UOPS_ISSUED_FLAGS_MERGE +Executed UOPs = UOPS_EXECUTED_THREAD +Retired UOPs = UOPS_RETIRED_ALL +- +This group returns information about the instruction pipeline. It measures the +issued, executed and retired uOPs and returns the number of uOPs which were issued +but not executed as well as the number of uOPs which were executed but never retired. +The executed but not retired uOPs commonly come from speculatively executed branches. + diff --git a/collectors/likwid/groups/ivybridgeEP/UOPS_EXEC.txt b/collectors/likwid/groups/ivybridgeEP/UOPS_EXEC.txt new file mode 100644 index 0000000..7042df7 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/UOPS_EXEC.txt @@ -0,0 +1,31 @@ +SHORT UOPs execution + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_USED_CYCLES +PMC1 UOPS_EXECUTED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the execution stage in the pipeline. Used cycles are all cycles where uOPs are +executed while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles. 
diff --git a/collectors/likwid/groups/ivybridgeEP/UOPS_ISSUE.txt b/collectors/likwid/groups/ivybridgeEP/UOPS_ISSUE.txt new file mode 100644 index 0000000..9aac923 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/UOPS_ISSUE.txt @@ -0,0 +1,31 @@ +SHORT UOPs issuing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_USED_CYCLES +PMC1 UOPS_ISSUED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the issue stage in the pipeline. Used cycles are all cycles where uOPs are +issued while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles. diff --git a/collectors/likwid/groups/ivybridgeEP/UOPS_RETIRE.txt b/collectors/likwid/groups/ivybridgeEP/UOPS_RETIRE.txt new file mode 100644 index 0000000..0f37585 --- /dev/null +++ b/collectors/likwid/groups/ivybridgeEP/UOPS_RETIRE.txt @@ -0,0 +1,31 @@ +SHORT UOPs retirement + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_RETIRED_USED_CYCLES +PMC1 UOPS_RETIRED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the retirement stage in the pipeline (re-order buffer). Used cycles are all +cycles where uOPs are retired while unused cycles refer to pipeline stalls. +Moreover, the group calculates the average stall duration in cycles.
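A note on the PMC3:EDGEDETECT trick shared by the three UOPS_* stall groups above: the edge-detected counter increments once per stall episode rather than once per stall cycle, so dividing total stall cycles by it yields the average episode length. A minimal Go sketch with hypothetical counter values:

package main

import "fmt"

func main() {
	// Hypothetical readings; names follow the UOPS_ISSUE group.
	usedCycles := 7.0e8  // PMC0: UOPS_ISSUED_USED_CYCLES
	stallCycles := 2.5e8 // PMC1: UOPS_ISSUED_STALL_CYCLES
	totalCycles := 1.0e9 // PMC2: CPU_CLOCK_UNHALTED_TOTAL_CYCLES
	stallEdges := 5.0e5  // PMC3:EDGEDETECT -> number of distinct stall episodes

	fmt.Printf("Used cycles ratio [%%]: %.1f\n", 100*usedCycles/totalCycles)
	fmt.Printf("Unused cycles ratio [%%]: %.1f\n", 100*stallCycles/totalCycles)
	fmt.Printf("Avg stall duration [cycles]: %.0f\n", stallCycles/stallEdges)
}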
diff --git a/collectors/likwid/groups/k10/BRANCH.txt b/collectors/likwid/groups/k10/BRANCH.txt new file mode 100644 index 0000000..5c4207e --- /dev/null +++ b/collectors/likwid/groups/k10/BRANCH.txt @@ -0,0 +1,26 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +PMC0 INSTRUCTIONS_RETIRED +PMC1 BRANCH_RETIRED +PMC2 BRANCH_MISPREDICT_RETIRED + +METRICS +Runtime (RDTSC) [s] time +Branch rate PMC1/PMC0 +Branch misprediction rate PMC2/PMC0 +Branch misprediction ratio PMC2/PMC1 +Instructions per branch PMC0/PMC1 + +LONG +Formulas: +Branch rate = BRANCH_RETIRED/INSTRUCTIONS_RETIRED +Branch misprediction rate = BRANCH_MISPREDICT_RETIRED/INSTRUCTIONS_RETIRED +Branch misprediction ratio = BRANCH_MISPREDICT_RETIRED/BRANCH_RETIRED +Instructions per branch = INSTRUCTIONS_RETIRED/BRANCH_RETIRED +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio directly states +what ratio of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/k10/CACHE.txt b/collectors/likwid/groups/k10/CACHE.txt new file mode 100644 index 0000000..26d799f --- /dev/null +++ b/collectors/likwid/groups/k10/CACHE.txt @@ -0,0 +1,34 @@ +SHORT Data cache miss rate/ratio + +EVENTSET +PMC0 INSTRUCTIONS_RETIRED +PMC1 DATA_CACHE_ACCESSES +PMC2 DATA_CACHE_REFILLS_L2_ALL +PMC3 DATA_CACHE_REFILLS_NORTHBRIDGE_ALL + +METRICS +Runtime (RDTSC) [s] time +data cache misses PMC2+PMC3 +data cache request rate PMC1/PMC0 +data cache miss rate (PMC2+PMC3)/PMC0 +data cache miss ratio (PMC2+PMC3)/PMC1 + +LONG +Formulas: +data cache misses = DATA_CACHE_REFILLS_L2_ALL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL +data cache request rate = DATA_CACHE_ACCESSES / INSTRUCTIONS_RETIRED +data cache miss rate = (DATA_CACHE_REFILLS_L2_ALL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL)/INSTRUCTIONS_RETIRED +data cache miss ratio = (DATA_CACHE_REFILLS_L2_ALL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL)/DATA_CACHE_ACCESSES +- +This group measures the locality of your data accesses with regard to the +L1 cache. Data cache request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The data cache miss rate gives a measure of how often it was necessary to get +cache lines from higher levels of the memory hierarchy. And finally +data cache miss ratio tells you how many of your memory references required +a cache line to be loaded from a higher level. While the data cache miss rate +might be given by your algorithm you should try to get data cache miss ratio +as low as possible by increasing your cache reuse. +This group was taken from the whitepaper -Basic Performance Measurements for AMD Athlon 64, +AMD Opteron and AMD Phenom Processors- by Paul J. Drongowski.
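To illustrate the relation between the branch metrics above, here is a tiny Go sketch with invented counts: the misprediction rate is normalized to all retired instructions, the ratio is normalized to branches only, and instructions per branch is the reciprocal of the branch rate.

package main

import "fmt"

func main() {
	instructions := 1.0e9 // PMC0: INSTRUCTIONS_RETIRED
	branches := 2.0e8     // PMC1: BRANCH_RETIRED
	mispredicts := 4.0e6  // PMC2: BRANCH_MISPREDICT_RETIRED

	branchRate := branches / instructions
	fmt.Printf("Branch rate: %.3f\n", branchRate)
	fmt.Printf("Branch misprediction rate: %.5f\n", mispredicts/instructions)
	fmt.Printf("Branch misprediction ratio: %.3f\n", mispredicts/branches)
	fmt.Printf("Instructions per branch: %.1f\n", instructions/branches) // = 1/branchRate
}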
+ diff --git a/collectors/likwid/groups/k10/CPI.txt b/collectors/likwid/groups/k10/CPI.txt new file mode 100644 index 0000000..850afed --- /dev/null +++ b/collectors/likwid/groups/k10/CPI.txt @@ -0,0 +1,26 @@ +SHORT Cycles per instruction + +EVENTSET +PMC0 INSTRUCTIONS_RETIRED +PMC1 CPU_CLOCKS_UNHALTED +PMC2 UOPS_RETIRED + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC1*inverseClock +CPI PMC1/PMC0 +CPI (based on uops) PMC1/PMC2 +IPC PMC0/PMC1 + +LONG +Formulas: +CPI = CPU_CLOCKS_UNHALTED/INSTRUCTIONS_RETIRED +CPI (based on uops) = CPU_CLOCKS_UNHALTED/UOPS_RETIRED +IPC = INSTRUCTIONS_RETIRED/CPU_CLOCKS_UNHALTED +- +This group measures how efficiently the processor works with +regard to instruction throughput. Also important as a standalone +metric is INSTRUCTIONS_RETIRED as it tells you how many instructions +you need to execute for a task. An optimization might show very +low CPI values but execute many more instructions for it. + diff --git a/collectors/likwid/groups/k10/FLOPS_DP.txt b/collectors/likwid/groups/k10/FLOPS_DP.txt new file mode 100644 index 0000000..89f0ac2 --- /dev/null +++ b/collectors/likwid/groups/k10/FLOPS_DP.txt @@ -0,0 +1,24 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +PMC0 SSE_RETIRED_ADD_DOUBLE_FLOPS +PMC1 SSE_RETIRED_MULT_DOUBLE_FLOPS +PMC2 CPU_CLOCKS_UNHALTED + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC2*inverseClock +DP [MFLOP/s] 1.0E-06*(PMC0+PMC1)/time +DP Add [MFLOP/s] 1.0E-06*PMC0/time +DP Mult [MFLOP/s] 1.0E-06*PMC1/time + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(SSE_RETIRED_ADD_DOUBLE_FLOPS+SSE_RETIRED_MULT_DOUBLE_FLOPS)/time +DP Add [MFLOP/s] = 1.0E-06*(SSE_RETIRED_ADD_DOUBLE_FLOPS)/time +DP Mult [MFLOP/s] = 1.0E-06*(SSE_RETIRED_MULT_DOUBLE_FLOPS)/time +- +Profiling group to measure double precision SSE FLOPs. +Don't forget that your code might also execute X87 FLOPs. + + diff --git a/collectors/likwid/groups/k10/FLOPS_SP.txt b/collectors/likwid/groups/k10/FLOPS_SP.txt new file mode 100644 index 0000000..590d39a --- /dev/null +++ b/collectors/likwid/groups/k10/FLOPS_SP.txt @@ -0,0 +1,24 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +PMC0 SSE_RETIRED_ADD_SINGLE_FLOPS +PMC1 SSE_RETIRED_MULT_SINGLE_FLOPS +PMC2 CPU_CLOCKS_UNHALTED + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC2*inverseClock +SP [MFLOP/s] 1.0E-06*(PMC0+PMC1)/time +SP Add [MFLOP/s] 1.0E-06*PMC0/time +SP Mult [MFLOP/s] 1.0E-06*PMC1/time + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(SSE_RETIRED_ADD_SINGLE_FLOPS+SSE_RETIRED_MULT_SINGLE_FLOPS)/time +SP Add [MFLOP/s] = 1.0E-06*(SSE_RETIRED_ADD_SINGLE_FLOPS)/time +SP Mult [MFLOP/s] = 1.0E-06*(SSE_RETIRED_MULT_SINGLE_FLOPS)/time +- +Profiling group to measure single precision SSE FLOPs. +Don't forget that your code might also execute X87 FLOPs.
+ + diff --git a/collectors/likwid/groups/k10/FLOPS_X87.txt b/collectors/likwid/groups/k10/FLOPS_X87.txt new file mode 100644 index 0000000..62fbefc --- /dev/null +++ b/collectors/likwid/groups/k10/FLOPS_X87.txt @@ -0,0 +1,25 @@ +SHORT X87 MFLOP/s + +EVENTSET +PMC0 X87_FLOPS_RETIRED_ADD +PMC1 X87_FLOPS_RETIRED_MULT +PMC2 X87_FLOPS_RETIRED_DIV +PMC3 CPU_CLOCKS_UNHALTED + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC3*inverseClock +X87 [MFLOP/s] 1.0E-06*(PMC0+PMC1+PMC2)/time +X87 Add [MFLOP/s] 1.0E-06*PMC0/time +X87 Mult [MFLOP/s] 1.0E-06*PMC1/time +X87 Div [MFLOP/s] 1.0E-06*PMC2/time + +LONG +Formulas: +X87 [MFLOP/s] = 1.0E-06*(X87_FLOPS_RETIRED_ADD+X87_FLOPS_RETIRED_MULT+X87_FLOPS_RETIRED_DIV)/time +X87 Add [MFLOP/s] = 1.0E-06*X87_FLOPS_RETIRED_ADD/time +X87 Mult [MFLOP/s] = 1.0E-06*X87_FLOPS_RETIRED_MULT/time +X87 Div [MFLOP/s] = 1.0E-06*X87_FLOPS_RETIRED_DIV/time +- +Profiling group to measure X87 FLOP rates. + diff --git a/collectors/likwid/groups/k10/FPU_EXCEPTION.txt b/collectors/likwid/groups/k10/FPU_EXCEPTION.txt new file mode 100644 index 0000000..23d3c54 --- /dev/null +++ b/collectors/likwid/groups/k10/FPU_EXCEPTION.txt @@ -0,0 +1,21 @@ +SHORT Floating point exceptions + +EVENTSET +PMC0 INSTRUCTIONS_RETIRED +PMC1 FP_INSTRUCTIONS_RETIRED_ALL +PMC2 FPU_EXCEPTIONS_ALL + +METRICS +Runtime (RDTSC) [s] time +Overall FP exception rate PMC2/PMC0 +FP exception rate PMC2/PMC1 + +LONG +Formulas: +Overall FP exception rate = FPU_EXCEPTIONS_ALL / INSTRUCTIONS_RETIRED +FP exception rate = FPU_EXCEPTIONS_ALL / FP_INSTRUCTIONS_RETIRED_ALL +- +Floating point exceptions occur e.g. on the treatment of denormal numbers. +There might be a large penalty if there are too many floating point +exceptions. + diff --git a/collectors/likwid/groups/k10/ICACHE.txt b/collectors/likwid/groups/k10/ICACHE.txt new file mode 100644 index 0000000..5150496 --- /dev/null +++ b/collectors/likwid/groups/k10/ICACHE.txt @@ -0,0 +1,23 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +PMC0 INSTRUCTIONS_RETIRED +PMC1 ICACHE_FETCHES +PMC2 ICACHE_REFILLS_L2 +PMC3 ICACHE_REFILLS_MEM + +METRICS +Runtime (RDTSC) [s] time +L1I request rate PMC1/PMC0 +L1I miss rate (PMC2+PMC3)/PMC0 +L1I miss ratio (PMC2+PMC3)/PMC1 + +LONG +Formulas: +L1I request rate = ICACHE_FETCHES / INSTRUCTIONS_RETIRED +L1I miss rate = (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/INSTRUCTIONS_RETIRED +L1I miss ratio = (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/ICACHE_FETCHES +- +This group measures the locality of your instruction code with regard to the +L1 I-Cache. 
+ diff --git a/collectors/likwid/groups/k10/L2.txt b/collectors/likwid/groups/k10/L2.txt new file mode 100644 index 0000000..fae6fb0 --- /dev/null +++ b/collectors/likwid/groups/k10/L2.txt @@ -0,0 +1,33 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +PMC0 DATA_CACHE_REFILLS_L2_ALL +PMC1 DATA_CACHE_EVICTED_ALL +PMC2 CPU_CLOCKS_UNHALTED + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC2*inverseClock +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*DATA_CACHE_REFILLS_L2_ALL*64.0/time +L2D load data volume [GBytes] = 1.0E-09*DATA_CACHE_REFILLS_L2_ALL*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*DATA_CACHE_EVICTED_ALL*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*DATA_CACHE_EVICTED_ALL*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_L2_ALL+DATA_CACHE_EVICTED_ALL)*64/time +L2 data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_L2_ALL+DATA_CACHE_EVICTED_ALL)*64 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is +computed by the number of cache lines loaded from L2 to L1 and the +number of modified cache lines evicted from the L1. +Note that this bandwidth also includes data transfers due to a +write allocate load on a store miss in L1 and copy back transfers if +originated from L2. + diff --git a/collectors/likwid/groups/k10/L2CACHE.txt b/collectors/likwid/groups/k10/L2CACHE.txt new file mode 100644 index 0000000..2d29e43 --- /dev/null +++ b/collectors/likwid/groups/k10/L2CACHE.txt @@ -0,0 +1,32 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +PMC0 INSTRUCTIONS_RETIRED +PMC1 L2_REQUESTS_ALL +PMC2 L2_MISSES_ALL +PMC3 L2_FILL_ALL + +METRICS +Runtime (RDTSC) [s] time +L2 request rate (PMC1+PMC3)/PMC0 +L2 miss rate PMC2/PMC0 +L2 miss ratio PMC2/(PMC1+PMC3) + +LONG +Formulas: +L2 request rate = (L2_REQUESTS_ALL+L2_FILL_ALL)/INSTRUCTIONS_RETIRED +L2 miss rate = L2_MISSES_ALL/INSTRUCTIONS_RETIRED +L2 miss ratio = L2_MISSES_ALL/(L2_REQUESTS_ALL+L2_FILL_ALL) +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get data cache miss ratio as low as possible by increasing your cache reuse. +This group was taken from the whitepaper -Basic Performance Measurements for AMD Athlon 64, +AMD Opteron and AMD Phenom Processors- by Paul J. Drongowski.
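All of the bandwidth groups in this set follow the same pattern: an event counting 64-byte cache line transfers is scaled by the line size and divided by the measurement time. A hypothetical Go sketch of the L2 metrics above (sample values invented):

package main

import "fmt"

func main() {
	const lineSize = 64.0 // bytes per cache line
	refills := 5.0e7      // PMC0: DATA_CACHE_REFILLS_L2_ALL (lines loaded L2 -> L1)
	evicts := 2.0e7       // PMC1: DATA_CACHE_EVICTED_ALL (modified lines written back)
	seconds := 1.5        // wall-clock measurement time ("time")

	bandwidth := 1.0e-06 * (refills + evicts) * lineSize / seconds // MBytes/s
	volume := 1.0e-09 * (refills + evicts) * lineSize              // GBytes
	fmt.Printf("L2 bandwidth [MBytes/s]: %.1f\n", bandwidth)
	fmt.Printf("L2 data volume [GBytes]: %.2f\n", volume)
}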
+ + diff --git a/collectors/likwid/groups/k10/MEM.txt b/collectors/likwid/groups/k10/MEM.txt new file mode 100644 index 0000000..f9f5a91 --- /dev/null +++ b/collectors/likwid/groups/k10/MEM.txt @@ -0,0 +1,35 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +PMC0 NORTHBRIDGE_READ_RESPONSE_ALL +PMC1 OCTWORDS_WRITE_TRANSFERS +PMC2 DRAM_ACCESSES_DCTO_ALL +PMC3 DRAM_ACCESSES_DCT1_ALL + +METRICS +Runtime (RDTSC) [s] time +Memory read bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +Memory read data volume [GBytes] 1.0E-09*PMC0*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*PMC1*8.0/time +Memory write data volume [GBytes] 1.0E-09*PMC1*8.0 +Memory bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time +Memory data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*NORTHBRIDGE_READ_RESPONSE_ALL*64/time +Memory read data volume [GBytes] = 1.0E-09*NORTHBRIDGE_READ_RESPONSE_ALL*64 +Memory write bandwidth [MBytes/s] = 1.0E-06*OCTWORDS_WRITE_TRANSFERS*8/time +Memory write data volume [GBytes] = 1.0E-09*OCTWORDS_WRITE_TRANSFERS*8 +Memory bandwidth [MBytes/s] = 1.0E-06*(DRAM_ACCESSES_DCTO_ALL+DRAM_ACCESSES_DCT1_ALL)*64/time +Memory data volume [GBytes] = 1.0E-09*(DRAM_ACCESSES_DCTO_ALL+DRAM_ACCESSES_DCT1_ALL)*64 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Note: As this group measures the accesses from all cores it only makes sense +to measure with one core per socket, similar to the Intel Nehalem Uncore events. +The memory read bandwidth contains all data from DRAM, L3, or another cache, +including another core on the same node. The event OCTWORDS_WRITE_TRANSFERS counts +16 Byte transfers, not 64 Byte. + + + diff --git a/collectors/likwid/groups/k10/NUMA_0_3.txt b/collectors/likwid/groups/k10/NUMA_0_3.txt new file mode 100644 index 0000000..66e56d9 --- /dev/null +++ b/collectors/likwid/groups/k10/NUMA_0_3.txt @@ -0,0 +1,27 @@ +SHORT Bandwidth on the HyperTransport links + +EVENTSET +PMC0 CPU_TO_DRAM_LOCAL_TO_0 +PMC1 CPU_TO_DRAM_LOCAL_TO_1 +PMC2 CPU_TO_DRAM_LOCAL_TO_2 +PMC3 CPU_TO_DRAM_LOCAL_TO_3 + +METRICS +Runtime (RDTSC) [s] time +Hyper Transport link0 bandwidth [MBytes/s] 1.0E-06*PMC0*4.0/time +Hyper Transport link1 bandwidth [MBytes/s] 1.0E-06*PMC1*4.0/time +Hyper Transport link2 bandwidth [MBytes/s] 1.0E-06*PMC2*4.0/time +Hyper Transport link3 bandwidth [MBytes/s] 1.0E-06*PMC3*4.0/time + +LONG +Formulas: +Hyper Transport link0 bandwidth [MBytes/s] = 1.0E-06*CPU_TO_DRAM_LOCAL_TO_0*4.0/time +Hyper Transport link1 bandwidth [MBytes/s] = 1.0E-06*CPU_TO_DRAM_LOCAL_TO_1*4.0/time +Hyper Transport link2 bandwidth [MBytes/s] = 1.0E-06*CPU_TO_DRAM_LOCAL_TO_2*4.0/time +Hyper Transport link3 bandwidth [MBytes/s] = 1.0E-06*CPU_TO_DRAM_LOCAL_TO_3*4.0/time +- +Profiling group to measure the bandwidth over the HyperTransport links. Can be used +to detect NUMA problems. Usually there should be only limited traffic over these +links for optimal performance.
+ + diff --git a/collectors/likwid/groups/k10/NUMA_4_7.txt b/collectors/likwid/groups/k10/NUMA_4_7.txt new file mode 100644 index 0000000..e13f2b9 --- /dev/null +++ b/collectors/likwid/groups/k10/NUMA_4_7.txt @@ -0,0 +1,27 @@ +SHORT Bandwidth on the HyperTransport links + +EVENTSET +PMC0 CPU_TO_DRAM_LOCAL_TO_4 +PMC1 CPU_TO_DRAM_LOCAL_TO_5 +PMC2 CPU_TO_DRAM_LOCAL_TO_6 +PMC3 CPU_TO_DRAM_LOCAL_TO_7 + +METRICS +Runtime (RDTSC) [s] time +Hyper Transport link4 bandwidth [MBytes/s] 1.0E-06*PMC0*4.0/time +Hyper Transport link5 bandwidth [MBytes/s] 1.0E-06*PMC1*4.0/time +Hyper Transport link6 bandwidth [MBytes/s] 1.0E-06*PMC2*4.0/time +Hyper Transport link7 bandwidth [MBytes/s] 1.0E-06*PMC3*4.0/time + +LONG +Formulas: +Hyper Transport link4 bandwidth [MBytes/s] = 1.0E-06*CPU_TO_DRAM_LOCAL_TO_4*4.0/time +Hyper Transport link5 bandwidth [MBytes/s] = 1.0E-06*CPU_TO_DRAM_LOCAL_TO_5*4.0/time +Hyper Transport link6 bandwidth [MBytes/s] = 1.0E-06*CPU_TO_DRAM_LOCAL_TO_6*4.0/time +Hyper Transport link7 bandwidth [MBytes/s] = 1.0E-06*CPU_TO_DRAM_LOCAL_TO_7*4.0/time +- +Profiling group to measure the bandwidth over the HyperTransport links. Can be used +to detect NUMA problems. Usually there should be only limited traffic over these +links for optimal performance. + + diff --git a/collectors/likwid/groups/k10/TLB.txt b/collectors/likwid/groups/k10/TLB.txt new file mode 100644 index 0000000..25cab33 --- /dev/null +++ b/collectors/likwid/groups/k10/TLB.txt @@ -0,0 +1,35 @@ +SHORT TLB miss rate/ratio + +EVENTSET +PMC0 INSTRUCTIONS_RETIRED +PMC1 DATA_CACHE_ACCESSES +PMC2 DTLB_L2_HIT_ALL +PMC3 DTLB_L2_MISS_ALL + +METRICS +Runtime (RDTSC) [s] time +L1 DTLB request rate PMC1/PMC0 +L1 DTLB miss rate (PMC2+PMC3)/PMC0 +L1 DTLB miss ratio (PMC2+PMC3)/PMC1 +L2 DTLB request rate (PMC2+PMC3)/PMC0 +L2 DTLB miss rate PMC3/PMC0 +L2 DTLB miss ratio PMC3/(PMC2+PMC3) + + +LONG +Formulas: +L1 DTLB request rate = DATA_CACHE_ACCESSES / INSTRUCTIONS_RETIRED +L1 DTLB miss rate = (DTLB_L2_HIT_ALL+DTLB_L2_MISS_ALL)/INSTRUCTIONS_RETIRED +L1 DTLB miss ratio = (DTLB_L2_HIT_ALL+DTLB_L2_MISS_ALL)/DATA_CACHE_ACCESSES +L2 DTLB request rate = (DTLB_L2_HIT_ALL+DTLB_L2_MISS_ALL)/INSTRUCTIONS_RETIRED +L2 DTLB miss rate = DTLB_L2_MISS_ALL / INSTRUCTIONS_RETIRED +L2 DTLB miss ratio = DTLB_L2_MISS_ALL / (DTLB_L2_HIT_ALL+DTLB_L2_MISS_ALL) +- +L1 DTLB request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The DTLB miss rate gives a measure of how often a TLB miss occurred +per instruction. And finally L1 DTLB miss ratio tells you how many +of your memory references caused a TLB miss on average. +NOTE: The L2 metrics are only relevant if L2 DTLB request rate is equal to the L1 DTLB miss rate! +This group was taken from the whitepaper -Basic Performance Measurements for AMD Athlon 64, +AMD Opteron and AMD Phenom Processors- by Paul J. Drongowski.
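Note the different scaling in the NUMA groups just above: each CPU_TO_DRAM_LOCAL_TO_* event is multiplied by 4 bytes rather than a 64-byte cache line. A hypothetical Go sketch (invented readings):

package main

import "fmt"

func main() {
	// Hypothetical CPU_TO_DRAM_LOCAL_TO_0..3 readings over one interval.
	links := []float64{1.2e9, 3.0e7, 2.5e7, 1.0e7}
	seconds := 2.0

	for i, events := range links {
		bw := 1.0e-06 * events * 4.0 / seconds // 4 bytes per event
		fmt.Printf("Hyper Transport link%d bandwidth [MBytes/s]: %.1f\n", i, bw)
	}
}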
diff --git a/collectors/likwid/groups/k8/BRANCH.txt b/collectors/likwid/groups/k8/BRANCH.txt new file mode 100644 index 0000000..f465335 --- /dev/null +++ b/collectors/likwid/groups/k8/BRANCH.txt @@ -0,0 +1,25 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +PMC0 INSTRUCTIONS_RETIRED +PMC1 BRANCH_RETIRED +PMC2 BRANCH_MISPREDICT_RETIRED + +METRICS +Runtime (RDTSC) [s] time +Branch rate PMC1/PMC0 +Branch misprediction rate PMC2/PMC0 +Branch misprediction ratio PMC2/PMC1 +Instructions per branch PMC0/PMC1 + +LONG +Formulas: +Branch rate = BRANCH_RETIRED/INSTRUCTIONS_RETIRED +Branch misprediction rate = BRANCH_MISPREDICT_RETIRED/INSTRUCTIONS_RETIRED +Branch misprediction ratio = BRANCH_MISPREDICT_RETIRED/BRANCH_RETIRED +Instructions per branch = INSTRUCTIONS_RETIRED/BRANCH_RETIRED +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio directly states +what ratio of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate. diff --git a/collectors/likwid/groups/k8/CACHE.txt b/collectors/likwid/groups/k8/CACHE.txt new file mode 100644 index 0000000..e5e813e --- /dev/null +++ b/collectors/likwid/groups/k8/CACHE.txt @@ -0,0 +1,33 @@ +SHORT Data cache miss rate/ratio + +EVENTSET +PMC0 INSTRUCTIONS_RETIRED +PMC1 DATA_CACHE_ACCESSES +PMC2 DATA_CACHE_REFILLS_L2_ALL +PMC3 DATA_CACHE_REFILLS_NORTHBRIDGE_ALL + +METRICS +Runtime (RDTSC) [s] time +data cache misses PMC2+PMC3 +data cache request rate PMC1/PMC0 +data cache miss rate (PMC2+PMC3)/PMC0 +data cache miss ratio (PMC2+PMC3)/PMC1 + +LONG +Formulas: +data cache misses = DATA_CACHE_REFILLS_L2_ALL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL +data cache request rate = DATA_CACHE_ACCESSES / INSTRUCTIONS_RETIRED +data cache miss rate = (DATA_CACHE_REFILLS_L2_ALL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL)/INSTRUCTIONS_RETIRED +data cache miss ratio = (DATA_CACHE_REFILLS_L2_ALL + DATA_CACHE_REFILLS_NORTHBRIDGE_ALL)/DATA_CACHE_ACCESSES +- +This group measures the locality of your data accesses with regard to the +L1 cache. Data cache request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The data cache miss rate gives a measure of how often it was necessary to get +cache lines from higher levels of the memory hierarchy. And finally +data cache miss ratio tells you how many of your memory references required +a cache line to be loaded from a higher level. While the data cache miss rate +might be given by your algorithm you should try to get data cache miss ratio +as low as possible by increasing your cache reuse. +This group was taken from the whitepaper -Basic Performance Measurements for AMD Athlon 64, +AMD Opteron and AMD Phenom Processors- by Paul J. Drongowski.
diff --git a/collectors/likwid/groups/k8/CPI.txt b/collectors/likwid/groups/k8/CPI.txt new file mode 100644 index 0000000..850afed --- /dev/null +++ b/collectors/likwid/groups/k8/CPI.txt @@ -0,0 +1,26 @@ +SHORT Cycles per instruction + +EVENTSET +PMC0 INSTRUCTIONS_RETIRED +PMC1 CPU_CLOCKS_UNHALTED +PMC2 UOPS_RETIRED + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC1*inverseClock +CPI PMC1/PMC0 +CPI (based on uops) PMC1/PMC2 +IPC PMC0/PMC1 + +LONG +Formulas: +CPI = CPU_CLOCKS_UNHALTED/INSTRUCTIONS_RETIRED +CPI (based on uops) = CPU_CLOCKS_UNHALTED/UOPS_RETIRED +IPC = INSTRUCTIONS_RETIRED/CPU_CLOCKS_UNHALTED +- +This group measures how efficiently the processor works with +regard to instruction throughput. Also important as a standalone +metric is INSTRUCTIONS_RETIRED as it tells you how many instructions +you need to execute for a task. An optimization might show very +low CPI values but execute many more instructions for it. + diff --git a/collectors/likwid/groups/k8/ICACHE.txt b/collectors/likwid/groups/k8/ICACHE.txt new file mode 100644 index 0000000..5150496 --- /dev/null +++ b/collectors/likwid/groups/k8/ICACHE.txt @@ -0,0 +1,23 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +PMC0 INSTRUCTIONS_RETIRED +PMC1 ICACHE_FETCHES +PMC2 ICACHE_REFILLS_L2 +PMC3 ICACHE_REFILLS_MEM + +METRICS +Runtime (RDTSC) [s] time +L1I request rate PMC1/PMC0 +L1I miss rate (PMC2+PMC3)/PMC0 +L1I miss ratio (PMC2+PMC3)/PMC1 + +LONG +Formulas: +L1I request rate = ICACHE_FETCHES / INSTRUCTIONS_RETIRED +L1I miss rate = (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/INSTRUCTIONS_RETIRED +L1I miss ratio = (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/ICACHE_FETCHES +- +This group measures the locality of your instruction code with regard to the +L1 I-Cache. + diff --git a/collectors/likwid/groups/k8/L2.txt b/collectors/likwid/groups/k8/L2.txt new file mode 100644 index 0000000..63b9b7f --- /dev/null +++ b/collectors/likwid/groups/k8/L2.txt @@ -0,0 +1,31 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +PMC0 DATA_CACHE_REFILLS_L2_ALL +PMC1 DATA_CACHE_EVICTED_ALL +PMC2 CPU_CLOCKS_UNHALTED + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC2*inverseClock +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 +L2 refill bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2 evict [MBytes/s] 1.0E-06*PMC1*64.0/time + +LONG +Formulas: +L2 bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_L2_ALL+DATA_CACHE_EVICTED_ALL)*64/time +L2 data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_L2_ALL+DATA_CACHE_EVICTED_ALL)*64 +L2 refill bandwidth [MBytes/s] = 1.0E-06*DATA_CACHE_REFILLS_L2_ALL*64/time +L2 evict [MBytes/s] = 1.0E-06*DATA_CACHE_EVICTED_ALL*64/time +- +Profiling group to measure L2 cache bandwidth. The bandwidth is +computed by the number of cache lines loaded from L2 to L1 and the +number of modified cache lines evicted from the L1. +Note that this bandwidth also includes data transfers due to a +write allocate load on a store miss in L1 and copy back transfers if +originated from L2.
+ + + diff --git a/collectors/likwid/groups/kabini/BRANCH.txt b/collectors/likwid/groups/kabini/BRANCH.txt new file mode 100644 index 0000000..7495b74 --- /dev/null +++ b/collectors/likwid/groups/kabini/BRANCH.txt @@ -0,0 +1,26 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +PMC0 RETIRED_INSTRUCTIONS +PMC1 RETIRED_BRANCH_INSTR +PMC2 RETIRED_MISPREDICTED_BRANCH_INSTR + +METRICS +Runtime (RDTSC) [s] time +Branch rate PMC1/PMC0 +Branch misprediction rate PMC2/PMC0 +Branch misprediction ratio PMC2/PMC1 +Instructions per branch PMC0/PMC1 + +LONG +Formulas: +Branch rate = RETIRED_BRANCH_INSTR/RETIRED_INSTRUCTIONS +Branch misprediction rate = RETIRED_MISPREDICTED_BRANCH_INSTR/RETIRED_INSTRUCTIONS +Branch misprediction ratio = RETIRED_MISPREDICTED_BRANCH_INSTR/RETIRED_BRANCH_INSTR +Instructions per branch = RETIRED_INSTRUCTIONS/RETIRED_BRANCH_INSTR +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio directly states +what ratio of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/kabini/CACHE.txt b/collectors/likwid/groups/kabini/CACHE.txt new file mode 100644 index 0000000..8a59288 --- /dev/null +++ b/collectors/likwid/groups/kabini/CACHE.txt @@ -0,0 +1,32 @@ +SHORT Data cache miss rate/ratio + +EVENTSET +PMC0 RETIRED_INSTRUCTIONS +PMC1 DATA_CACHE_ACCESSES +PMC2 DATA_CACHE_REFILLS_ALL +PMC3 DATA_CACHE_REFILLS_NB_ALL + +METRICS +Runtime (RDTSC) [s] time +data cache misses PMC2+PMC3 +data cache request rate PMC1/PMC0 +data cache miss rate (PMC2+PMC3)/PMC0 +data cache miss ratio (PMC2+PMC3)/PMC1 + +LONG +Formulas: +data cache misses = DATA_CACHE_REFILLS_ALL + DATA_CACHE_REFILLS_NB_ALL +data cache request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS +data cache miss rate = (DATA_CACHE_REFILLS_ALL + DATA_CACHE_REFILLS_NB_ALL)/RETIRED_INSTRUCTIONS +data cache miss ratio = (DATA_CACHE_REFILLS_ALL + DATA_CACHE_REFILLS_NB_ALL)/DATA_CACHE_ACCESSES +- +This group measures the locality of your data accesses with regard to the +L1 cache. Data cache request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The data cache miss rate gives a measure of how often it was necessary to get +cache lines from higher levels of the memory hierarchy. And finally +data cache miss ratio tells you how many of your memory references required +a cache line to be loaded from a higher level. While the data cache miss rate +might be given by your algorithm you should try to get data cache miss ratio +as low as possible by increasing your cache reuse. + diff --git a/collectors/likwid/groups/kabini/CPI.txt b/collectors/likwid/groups/kabini/CPI.txt new file mode 100644 index 0000000..c0746e7 --- /dev/null +++ b/collectors/likwid/groups/kabini/CPI.txt @@ -0,0 +1,26 @@ +SHORT Cycles per instruction + +EVENTSET +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_UOPS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC1*inverseClock +CPI PMC1/PMC0 +CPI (based on uops) PMC1/PMC2 +IPC PMC0/PMC1 + +LONG +Formulas: +CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS +CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS +IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED +- +This group measures how efficiently the processor works with +regard to instruction throughput.
Also important as a standalone +metric is RETIRED_INSTRUCTIONS as it tells you how many instructions +you need to execute for a task. An optimization might show very +low CPI values but execute many more instructions for it. + diff --git a/collectors/likwid/groups/kabini/DATA.txt b/collectors/likwid/groups/kabini/DATA.txt new file mode 100644 index 0000000..75f1f60 --- /dev/null +++ b/collectors/likwid/groups/kabini/DATA.txt @@ -0,0 +1,16 @@ +SHORT Load to store ratio + +EVENTSET +PMC0 LS_DISPATCH_LOADS +PMC1 LS_DISPATCH_STORES + +METRICS +Runtime (RDTSC) [s] time +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = LS_DISPATCH_LOADS/LS_DISPATCH_STORES +- +This is a simple metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/kabini/FLOPS_DP.txt b/collectors/likwid/groups/kabini/FLOPS_DP.txt new file mode 100644 index 0000000..1a4e54c --- /dev/null +++ b/collectors/likwid/groups/kabini/FLOPS_DP.txt @@ -0,0 +1,26 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_UOPS +PMC3 RETIRED_FLOPS_DOUBLE_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC1*inverseClock +DP [MFLOP/s] 1.0E-06*(PMC3)/time +CPI PMC1/PMC0 +CPI (based on uops) PMC1/PMC2 +IPC PMC0/PMC1 + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(RETIRED_FLOPS_DOUBLE_ALL)/time +CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS +CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS +IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED +- +Profiling group to measure double precision FLOP rate. + + diff --git a/collectors/likwid/groups/kabini/FLOPS_SP.txt b/collectors/likwid/groups/kabini/FLOPS_SP.txt new file mode 100644 index 0000000..f6c08c1 --- /dev/null +++ b/collectors/likwid/groups/kabini/FLOPS_SP.txt @@ -0,0 +1,26 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_UOPS +PMC3 RETIRED_FLOPS_SINGLE_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC1*inverseClock +SP [MFLOP/s] 1.0E-06*(PMC3)/time +CPI PMC1/PMC0 +CPI (based on uops) PMC1/PMC2 +IPC PMC0/PMC1 + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(RETIRED_FLOPS_SINGLE_ALL)/time +CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS +CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS +IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED +- +Profiling group to measure single precision FLOP rate. + + diff --git a/collectors/likwid/groups/kabini/FPU_EXCEPTION.txt b/collectors/likwid/groups/kabini/FPU_EXCEPTION.txt new file mode 100644 index 0000000..5ed02c6 --- /dev/null +++ b/collectors/likwid/groups/kabini/FPU_EXCEPTION.txt @@ -0,0 +1,21 @@ +SHORT Floating point exceptions + +EVENTSET +PMC0 RETIRED_INSTRUCTIONS +PMC1 RETIRED_FP_INSTRUCTIONS_ALL +PMC2 FPU_EXCEPTION_ALL + +METRICS +Runtime (RDTSC) [s] time +Overall FP exception rate PMC2/PMC0 +FP exception rate PMC2/PMC1 + +LONG +Formulas: +Overall FP exception rate = FPU_EXCEPTION_ALL / RETIRED_INSTRUCTIONS +FP exception rate = FPU_EXCEPTION_ALL / RETIRED_FP_INSTRUCTIONS_ALL +- +Floating point exceptions occur e.g. on the treatment of denormal numbers. +There might be a large penalty if there are too many floating point +exceptions.
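Since the kabini FLOPS groups count actual retired FLOPs (RETIRED_FLOPS_DOUBLE_ALL / RETIRED_FLOPS_SINGLE_ALL), the MFLOP/s metric is a plain rate; a hypothetical Go sketch with invented readings:

package main

import "fmt"

func main() {
	flops := 3.0e9  // PMC3: RETIRED_FLOPS_DOUBLE_ALL over the interval
	cycles := 5.0e9 // PMC1: CPU_CLOCKS_UNHALTED
	instr := 4.0e9  // PMC0: RETIRED_INSTRUCTIONS
	seconds := 2.0  // wall-clock measurement time

	fmt.Printf("DP [MFLOP/s]: %.1f\n", 1.0e-06*flops/seconds)
	fmt.Printf("CPI: %.2f  IPC: %.2f\n", cycles/instr, instr/cycles)
}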
+ diff --git a/collectors/likwid/groups/kabini/ICACHE.txt b/collectors/likwid/groups/kabini/ICACHE.txt new file mode 100644 index 0000000..62b91d6 --- /dev/null +++ b/collectors/likwid/groups/kabini/ICACHE.txt @@ -0,0 +1,23 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +PMC0 INSTRUCTION_CACHE_FETCHES +PMC1 INSTRUCTION_CACHE_L2_REFILLS +PMC2 INSTRUCTION_CACHE_SYSTEM_REFILLS +PMC3 RETIRED_INSTRUCTIONS + +METRICS +Runtime (RDTSC) [s] time +L1I request rate PMC0/PMC3 +L1I miss rate (PMC1+PMC2)/PMC3 +L1I miss ratio (PMC1+PMC2)/PMC0 + +LONG +Formulas: +L1I request rate = INSTRUCTION_CACHE_FETCHES / RETIRED_INSTRUCTIONS +L1I miss rate = (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/RETIRED_INSTRUCTIONS +L1I miss ratio = (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/INSTRUCTION_CACHE_FETCHES +- +This group measures the locality of your instruction code with regard to the +L1 I-Cache. + diff --git a/collectors/likwid/groups/kabini/L2.txt b/collectors/likwid/groups/kabini/L2.txt new file mode 100644 index 0000000..3598a54 --- /dev/null +++ b/collectors/likwid/groups/kabini/L2.txt @@ -0,0 +1,33 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +PMC0 DATA_CACHE_REFILLS_ALL +PMC1 DATA_CACHE_EVICTED_ALL +PMC2 CPU_CLOCKS_UNHALTED + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC2*inverseClock +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*DATA_CACHE_REFILLS_ALL*64.0/time +L2D load data volume [GBytes] = 1.0E-09*DATA_CACHE_REFILLS_ALL*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*DATA_CACHE_EVICTED_ALL*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*DATA_CACHE_EVICTED_ALL*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_ALL+DATA_CACHE_EVICTED_ALL)*64/time +L2 data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_ALL+DATA_CACHE_EVICTED_ALL)*64 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is +computed by the number of cache lines loaded from L2 to L1 and the +number of modified cache lines evicted from the L1. +Note that this bandwidth also includes data transfers due to a +write allocate load on a store miss in L1 and copy back transfers if +originated from L2. + diff --git a/collectors/likwid/groups/kabini/MEM.txt b/collectors/likwid/groups/kabini/MEM.txt new file mode 100644 index 0000000..2fa9dfe --- /dev/null +++ b/collectors/likwid/groups/kabini/MEM.txt @@ -0,0 +1,20 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +UPMC0 UNC_DRAM_ACCESSES_DCT0_ALL +UPMC1 UNC_DRAM_ACCESSES_DCT1_ALL + +METRICS +Runtime (RDTSC) [s] time +Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64.0 + +LONG +Formulas: +Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_DRAM_ACCESSES_DCT0_ALL+UNC_DRAM_ACCESSES_DCT1_ALL)*64/time +Memory data volume [GBytes] = 1.0E-09*(UNC_DRAM_ACCESSES_DCT0_ALL+UNC_DRAM_ACCESSES_DCT1_ALL)*64 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Note: As this group measures the accesses from all cores it only makes sense +to measure with one core per socket, similar to the Intel Nehalem Uncore events.
+ diff --git a/collectors/likwid/groups/kabini/NUMA_0_3.txt b/collectors/likwid/groups/kabini/NUMA_0_3.txt new file mode 100644 index 0000000..79f3618 --- /dev/null +++ b/collectors/likwid/groups/kabini/NUMA_0_3.txt @@ -0,0 +1,28 @@ +SHORT Read/Write Events between the ccNUMA nodes + +EVENTSET +UPMC0 UNC_CPU_TO_DRAM_LOCAL_TO_0 +UPMC1 UNC_CPU_TO_DRAM_LOCAL_TO_1 +UPMC2 UNC_CPU_TO_DRAM_LOCAL_TO_2 +UPMC3 UNC_CPU_TO_DRAM_LOCAL_TO_3 + +METRICS +Runtime (RDTSC) [s] time +DRAM read/write local to 0 [MegaEvents/s] 1.0E-06*UPMC0/time +DRAM read/write local to 1 [MegaEvents/s] 1.0E-06*UPMC1/time +DRAM read/write local to 2 [MegaEvents/s] 1.0E-06*UPMC2/time +DRAM read/write local to 3 [MegaEvents/s] 1.0E-06*UPMC3/time + +LONG +Formulas: +DRAM read/write local to 0 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_0/time +DRAM read/write local to 1 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_1/time +DRAM read/write local to 2 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_2/time +DRAM read/write local to 3 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_3/time +- +Profiling group to measure the traffic from the local CPU to the different +DRAM NUMA nodes. This group allows you to detect NUMA problems in threaded +code. You must first determine on which memory domains your code is running. +A code should only have significant traffic to its own memory domain. + + diff --git a/collectors/likwid/groups/kabini/NUMA_4_7.txt b/collectors/likwid/groups/kabini/NUMA_4_7.txt new file mode 100644 index 0000000..7b518db --- /dev/null +++ b/collectors/likwid/groups/kabini/NUMA_4_7.txt @@ -0,0 +1,28 @@ +SHORT Read/Write Events between the ccNUMA nodes + +EVENTSET +UPMC0 UNC_CPU_TO_DRAM_LOCAL_TO_4 +UPMC1 UNC_CPU_TO_DRAM_LOCAL_TO_5 +UPMC2 UNC_CPU_TO_DRAM_LOCAL_TO_6 +UPMC3 UNC_CPU_TO_DRAM_LOCAL_TO_7 + +METRICS +Runtime (RDTSC) [s] time +DRAM read/write local to 4 [MegaEvents/s] 1.0E-06*UPMC0/time +DRAM read/write local to 5 [MegaEvents/s] 1.0E-06*UPMC1/time +DRAM read/write local to 6 [MegaEvents/s] 1.0E-06*UPMC2/time +DRAM read/write local to 7 [MegaEvents/s] 1.0E-06*UPMC3/time + +LONG +Formulas: +DRAM read/write local to 4 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_4/time +DRAM read/write local to 5 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_5/time +DRAM read/write local to 6 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_6/time +DRAM read/write local to 7 [MegaEvents/s] = 1.0E-06*UNC_CPU_TO_DRAM_LOCAL_TO_7/time +- +Profiling group to measure the traffic from the local CPU to the different +DRAM NUMA nodes. This group allows you to detect NUMA problems in threaded +code. You must first determine on which memory domains your code is running. +A code should only have significant traffic to its own memory domain.
+ + diff --git a/collectors/likwid/groups/kabini/TLB.txt b/collectors/likwid/groups/kabini/TLB.txt new file mode 100644 index 0000000..f66b3cb --- /dev/null +++ b/collectors/likwid/groups/kabini/TLB.txt @@ -0,0 +1,34 @@ +SHORT TLB miss rate/ratio + +EVENTSET +PMC0 RETIRED_INSTRUCTIONS +PMC1 DATA_CACHE_ACCESSES +PMC2 L2_DTLB_HIT_ALL +PMC3 DTLB_MISS_ALL + +METRICS +Runtime (RDTSC) [s] time +L1 DTLB request rate PMC1/PMC0 +L1 DTLB miss rate (PMC2+PMC3)/PMC0 +L1 DTLB miss ratio (PMC2+PMC3)/PMC1 +L2 DTLB request rate (PMC2+PMC3)/PMC0 +L2 DTLB miss rate PMC3/PMC0 +L2 DTLB miss ratio PMC3/(PMC2+PMC3) + + +LONG +Formulas: +L1 DTLB request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS +L1 DTLB miss rate = (L2_DTLB_HIT_ALL+DTLB_MISS_ALL)/RETIRED_INSTRUCTIONS +L1 DTLB miss ratio = (L2_DTLB_HIT_ALL+DTLB_MISS_ALL)/DATA_CACHE_ACCESSES +L2 DTLB request rate = (L2_DTLB_HIT_ALL+DTLB_MISS_ALL)/RETIRED_INSTRUCTIONS +L2 DTLB miss rate = DTLB_MISS_ALL / RETIRED_INSTRUCTIONS +L2 DTLB miss ratio = DTLB_MISS_ALL / (L2_DTLB_HIT_ALL+DTLB_MISS_ALL) +- +L1 DTLB request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The DTLB miss rate gives a measure of how often a TLB miss occurred +per instruction. And finally L1 DTLB miss ratio tells you how many +of your memory references caused a TLB miss on average. +NOTE: The L2 metrics are only relevant if L2 DTLB request rate is +equal to the L1 DTLB miss rate! diff --git a/collectors/likwid/groups/knl/BRANCH.txt b/collectors/likwid/groups/knl/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/knl/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio directly states +what ratio of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/knl/CLOCK.txt b/collectors/likwid/groups/knl/CLOCK.txt new file mode 100644 index 0000000..8756ef2 --- /dev/null +++ b/collectors/likwid/groups/knl/CLOCK.txt @@ -0,0 +1,23 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +- +The Xeon Phi (Knights Landing) implements the new RAPL interface. This interface makes it +possible to monitor the energy consumed at the package (socket) level.
+ diff --git a/collectors/likwid/groups/knl/DATA.txt b/collectors/likwid/groups/knl/DATA.txt new file mode 100644 index 0000000..61a915b --- /dev/null +++ b/collectors/likwid/groups/knl/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_UOPS_RETIRED_ALL_LOADS +PMC1 MEM_UOPS_RETIRED_ALL_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_UOPS_RETIRED_ALL_LOADS/MEM_UOPS_RETIRED_ALL_STORES +- +This is a metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/knl/DIVIDE.txt b/collectors/likwid/groups/knl/DIVIDE.txt new file mode 100644 index 0000000..d9b0918 --- /dev/null +++ b/collectors/likwid/groups/knl/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLES_DIV_BUSY_COUNT +PMC1 CYCLES_DIV_BUSY + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = CYCLES_DIV_BUSY_COUNT +Avg. divide unit usage duration = CYCLES_DIV_BUSY/CYCLES_DIV_BUSY_COUNT +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/knl/ENERGY.txt b/collectors/likwid/groups/knl/ENERGY.txt new file mode 100644 index 0000000..19ede75 --- /dev/null +++ b/collectors/likwid/groups/knl/ENERGY.txt @@ -0,0 +1,33 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR3 PWR_DRAM_ENERGY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +Knights Landing implements the new RAPL interface. This interface makes it +possible to monitor the energy consumed at the package (socket) level.
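The RAPL-based metrics above are simple quotients: PWR_*_ENERGY reports Joules accumulated over the measurement interval, and power is energy over time. A hypothetical Go sketch (invented readings):

package main

import "fmt"

func main() {
	pkgEnergy := 120.0 // PWR0: PWR_PKG_ENERGY [J] accumulated over the interval
	dramEnergy := 18.5 // PWR3: PWR_DRAM_ENERGY [J]
	seconds := 2.0     // wall-clock measurement time

	fmt.Printf("Power [W]: %.1f\n", pkgEnergy/seconds)
	fmt.Printf("Power DRAM [W]: %.2f\n", dramEnergy/seconds)
}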
+ diff --git a/collectors/likwid/groups/knl/FLOPS_DP.txt b/collectors/likwid/groups/knl/FLOPS_DP.txt new file mode 100644 index 0000000..88bffe2 --- /dev/null +++ b/collectors/likwid/groups/knl/FLOPS_DP.txt @@ -0,0 +1,34 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_RETIRED_SCALAR_SIMD +PMC1 UOPS_RETIRED_PACKED_SIMD + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] (SSE assumed) 1.0E-06*((PMC1*2.0)+PMC0)/time +DP [MFLOP/s] (AVX assumed) 1.0E-06*((PMC1*4.0)+PMC0)/time +DP [MFLOP/s] (AVX512 assumed) 1.0E-06*((PMC1*8.0)+PMC0)/time +Packed [MUOPS/s] 1.0E-06*(PMC1)/time +Scalar [MUOPS/s] 1.0E-06*PMC0/time + +LONG +Formulas: +DP [MFLOP/s] (SSE assumed) = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD*2+UOPS_RETIRED_SCALAR_SIMD)/runtime +DP [MFLOP/s] (AVX assumed) = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD*4+UOPS_RETIRED_SCALAR_SIMD)/runtime +DP [MFLOP/s] (AVX512 assumed) = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD*8+UOPS_RETIRED_SCALAR_SIMD)/runtime +Packed [MUOPS/s] = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD)/runtime +Scalar [MUOPS/s] = 1.0E-06*UOPS_RETIRED_SCALAR_SIMD/runtime +- +AVX/SSE scalar and packed double precision FLOP rates. The Xeon Phi (Knights Landing) provides +no possibility to differentiate between double and single precision FLOP/s. Therefore, we can only +assume that the printed [MFLOP/s] value is for double-precision code. Moreover, there is no way
to distinguish between SSE, AVX or AVX512 packed SIMD operations. Therefore, this group prints +out the [MFLOP/s] for different SIMD techniques. +WARNING: The events also count integer arithmetic diff --git a/collectors/likwid/groups/knl/FLOPS_SP.txt b/collectors/likwid/groups/knl/FLOPS_SP.txt new file mode 100644 index 0000000..4a28116 --- /dev/null +++ b/collectors/likwid/groups/knl/FLOPS_SP.txt @@ -0,0 +1,34 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_RETIRED_SCALAR_SIMD +PMC1 UOPS_RETIRED_PACKED_SIMD + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] (SSE assumed) 1.0E-06*(PMC1*4.0+PMC0)/time +SP [MFLOP/s] (AVX assumed) 1.0E-06*(PMC1*8.0+PMC0)/time +SP [MFLOP/s] (AVX512 assumed) 1.0E-06*(PMC1*16.0+PMC0)/time +Packed [MUOPS/s] 1.0E-06*(PMC1)/time +Scalar [MUOPS/s] 1.0E-06*PMC0/time + +LONG +Formulas: +SP [MFLOP/s] (SSE assumed) = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD*4+UOPS_RETIRED_SCALAR_SIMD)/runtime +SP [MFLOP/s] (AVX assumed) = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD*8+UOPS_RETIRED_SCALAR_SIMD)/runtime +SP [MFLOP/s] (AVX512 assumed) = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD*16+UOPS_RETIRED_SCALAR_SIMD)/runtime +Packed [MUOPS/s] = 1.0E-06*(UOPS_RETIRED_PACKED_SIMD)/runtime +Scalar [MUOPS/s] = 1.0E-06*UOPS_RETIRED_SCALAR_SIMD/runtime +- +AVX/SSE scalar and packed single precision FLOP rates. The Xeon Phi (Knights Landing) provides +no possibility to differentiate between double and single precision FLOP/s. Therefore, we can only +assume that the printed MFLOP/s value is for single-precision code. Moreover, there is no way +to distinguish between SSE, AVX or AVX512 packed SIMD operations. Therefore, this group prints +out the MFLOP/s for different SIMD techniques.
+WARNING: The events also count integer arithmetic diff --git a/collectors/likwid/groups/knl/FRONTEND_STALLS.txt b/collectors/likwid/groups/knl/FRONTEND_STALLS.txt new file mode 100644 index 0000000..1b9f98e --- /dev/null +++ b/collectors/likwid/groups/knl/FRONTEND_STALLS.txt @@ -0,0 +1,25 @@ +SHORT Frontend stalls + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 NO_ALLOC_CYCLES_ALL +PMC1 NO_ALLOC_CYCLES_ALL_COUNT + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Frontend stalls PMC1 +Avg. frontend stall duration [cyc] PMC0/PMC1 +Frontend stall ratio PMC0/FIXC1 + +LONG +Formulas: +Frontend stalls = NO_ALLOC_CYCLES_ALL +Avg. frontend stall duration [cyc] = NO_ALLOC_CYCLES_ALL/NO_ALLOC_CYCLES_ALL_COUNT +Frontend stall ratio = NO_ALLOC_CYCLES_ALL/CPU_CLK_UNHALTED_CORE +- +This group measures stalls in the frontend: the number of stall phases, their +average duration in cycles and the ratio of stalled cycles to all core cycles. diff --git a/collectors/likwid/groups/knl/HBM.txt b/collectors/likwid/groups/knl/HBM.txt new file mode 100644 index 0000000..ac44418 --- /dev/null +++ b/collectors/likwid/groups/knl/HBM.txt @@ -0,0 +1,46 @@ +SHORT Memory bandwidth in MBytes/s for High Bandwidth Memory (HBM) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +EDBOX0C0 EDC_RPQ_INSERTS +EDBOX1C0 EDC_RPQ_INSERTS +EDBOX2C0 EDC_RPQ_INSERTS +EDBOX3C0 EDC_RPQ_INSERTS +EDBOX4C0 EDC_RPQ_INSERTS +EDBOX5C0 EDC_RPQ_INSERTS +EDBOX6C0 EDC_RPQ_INSERTS +EDBOX7C0 EDC_RPQ_INSERTS +EDBOX0C1 EDC_WPQ_INSERTS +EDBOX1C1 EDC_WPQ_INSERTS +EDBOX2C1 EDC_WPQ_INSERTS +EDBOX3C1 EDC_WPQ_INSERTS +EDBOX4C1 EDC_WPQ_INSERTS +EDBOX5C1 EDC_WPQ_INSERTS +EDBOX6C1 EDC_WPQ_INSERTS +EDBOX7C1 EDC_WPQ_INSERTS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(EDBOX0C0+EDBOX1C0+EDBOX2C0+EDBOX3C0+EDBOX4C0+EDBOX5C0+EDBOX6C0+EDBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(EDBOX0C0+EDBOX1C0+EDBOX2C0+EDBOX3C0+EDBOX4C0+EDBOX5C0+EDBOX6C0+EDBOX7C0)*64.0 +Memory writeback bandwidth [MBytes/s] 1.0E-06*(EDBOX0C1+EDBOX1C1+EDBOX2C1+EDBOX3C1+EDBOX4C1+EDBOX5C1+EDBOX6C1+EDBOX7C1)*64.0/time +Memory writeback data volume [GBytes] 1.0E-09*(EDBOX0C1+EDBOX1C1+EDBOX2C1+EDBOX3C1+EDBOX4C1+EDBOX5C1+EDBOX6C1+EDBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(EDBOX0C0+EDBOX1C0+EDBOX2C0+EDBOX3C0+EDBOX4C0+EDBOX5C0+EDBOX6C0+EDBOX7C0+EDBOX0C1+EDBOX1C1+EDBOX2C1+EDBOX3C1+EDBOX4C1+EDBOX5C1+EDBOX6C1+EDBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(EDBOX0C0+EDBOX1C0+EDBOX2C0+EDBOX3C0+EDBOX4C0+EDBOX5C0+EDBOX6C0+EDBOX7C0+EDBOX0C1+EDBOX1C1+EDBOX2C1+EDBOX3C1+EDBOX4C1+EDBOX5C1+EDBOX6C1+EDBOX7C1)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(sum(EDC_RPQ_INSERTS))*64/time +Memory read data volume [GBytes] = 1.0E-09*(sum(EDC_RPQ_INSERTS))*64 +Memory writeback bandwidth [MBytes/s] = 1.0E-06*(sum(EDC_WPQ_INSERTS))*64/time +Memory writeback data volume [GBytes] = 1.0E-09*(sum(EDC_WPQ_INSERTS))*64 +Memory bandwidth [MBytes/s] = 1.0E-06*(sum(EDC_RPQ_INSERTS)+sum(EDC_WPQ_INSERTS))*64/time +Memory data volume [GBytes] = 1.0E-09*(sum(EDC_RPQ_INSERTS)+sum(EDC_WPQ_INSERTS))*64 +- +Profiling group to measure data transfers from and to the high bandwidth memory (HBM).
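The HBM group aggregates the same event across all eight EDC boxes before the usual 64-byte scaling, which is what the sum(...) shorthand in its formulas denotes. A hypothetical Go sketch (invented per-box readings):

package main

import "fmt"

// sum adds up the per-box readings of one uncore event.
func sum(xs []float64) float64 {
	var s float64
	for _, x := range xs {
		s += x
	}
	return s
}

func main() {
	// Hypothetical EDC_RPQ_INSERTS / EDC_WPQ_INSERTS per EDC box (EDBOX0..7).
	reads := []float64{1e8, 9e7, 1.1e8, 1e8, 9.5e7, 1e8, 1.05e8, 9.8e7}
	writes := []float64{4e7, 3.8e7, 4.1e7, 4e7, 3.9e7, 4e7, 4.2e7, 3.7e7}
	seconds := 1.0

	readBW := 1.0e-06 * sum(reads) * 64.0 / seconds
	writeBW := 1.0e-06 * sum(writes) * 64.0 / seconds
	fmt.Printf("HBM read bandwidth [MBytes/s]: %.1f\n", readBW)
	fmt.Printf("HBM writeback bandwidth [MBytes/s]: %.1f\n", writeBW)
	fmt.Printf("HBM total bandwidth [MBytes/s]: %.1f\n", readBW+writeBW)
}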
+ diff --git a/collectors/likwid/groups/knl/HBM_CACHE.txt b/collectors/likwid/groups/knl/HBM_CACHE.txt new file mode 100644 index 0000000..f89af5d --- /dev/null +++ b/collectors/likwid/groups/knl/HBM_CACHE.txt @@ -0,0 +1,87 @@ +SHORT Memory bandwidth in MBytes/s for High Bandwidth Memory (HBM) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +EDBOX0C0 EDC_RPQ_INSERTS +EDBOX1C0 EDC_RPQ_INSERTS +EDBOX2C0 EDC_RPQ_INSERTS +EDBOX3C0 EDC_RPQ_INSERTS +EDBOX4C0 EDC_RPQ_INSERTS +EDBOX5C0 EDC_RPQ_INSERTS +EDBOX6C0 EDC_RPQ_INSERTS +EDBOX7C0 EDC_RPQ_INSERTS +EDBOX0C1 EDC_WPQ_INSERTS +EDBOX1C1 EDC_WPQ_INSERTS +EDBOX2C1 EDC_WPQ_INSERTS +EDBOX3C1 EDC_WPQ_INSERTS +EDBOX4C1 EDC_WPQ_INSERTS +EDBOX5C1 EDC_WPQ_INSERTS +EDBOX6C1 EDC_WPQ_INSERTS +EDBOX7C1 EDC_WPQ_INSERTS +EUBOX0C0 EDC_MISS_CLEAN +EUBOX1C0 EDC_MISS_CLEAN +EUBOX2C0 EDC_MISS_CLEAN +EUBOX3C0 EDC_MISS_CLEAN +EUBOX4C0 EDC_MISS_CLEAN +EUBOX5C0 EDC_MISS_CLEAN +EUBOX6C0 EDC_MISS_CLEAN +EUBOX7C0 EDC_MISS_CLEAN +EUBOX0C1 EDC_MISS_DIRTY +EUBOX1C1 EDC_MISS_DIRTY +EUBOX2C1 EDC_MISS_DIRTY +EUBOX3C1 EDC_MISS_DIRTY +EUBOX4C1 EDC_MISS_DIRTY +EUBOX5C1 EDC_MISS_DIRTY +EUBOX6C1 EDC_MISS_DIRTY +EUBOX7C1 EDC_MISS_DIRTY +MBOX0C0 MC_CAS_READS +MBOX0C1 MC_CAS_WRITES +MBOX1C0 MC_CAS_READS +MBOX1C1 MC_CAS_WRITES +MBOX2C0 MC_CAS_READS +MBOX2C1 MC_CAS_WRITES +MBOX4C0 MC_CAS_READS +MBOX4C1 MC_CAS_WRITES +MBOX5C0 MC_CAS_READS +MBOX5C1 MC_CAS_WRITES +MBOX6C0 MC_CAS_READS +MBOX6C1 MC_CAS_WRITES + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +MCDRAM Memory read bandwidth [MBytes/s] 1.0E-06*((EDBOX0C0+EDBOX1C0+EDBOX2C0+EDBOX3C0+EDBOX4C0+EDBOX5C0+EDBOX6C0+EDBOX7C0)-(EUBOX0C0+EUBOX1C0+EUBOX2C0+EUBOX3C0+EUBOX4C0+EUBOX5C0+EUBOX6C0+EUBOX7C0)-(EUBOX0C1+EUBOX1C1+EUBOX2C1+EUBOX3C1+EUBOX4C1+EUBOX5C1+EUBOX6C1+EUBOX7C1))*64.0/time +MCDRAM Memory read data volume [GBytes] 1.0E-09*((EDBOX0C0+EDBOX1C0+EDBOX2C0+EDBOX3C0+EDBOX4C0+EDBOX5C0+EDBOX6C0+EDBOX7C0)-(EUBOX0C0+EUBOX1C0+EUBOX2C0+EUBOX3C0+EUBOX4C0+EUBOX5C0+EUBOX6C0+EUBOX7C0)-(EUBOX0C1+EUBOX1C1+EUBOX2C1+EUBOX3C1+EUBOX4C1+EUBOX5C1+EUBOX6C1+EUBOX7C1))*64.0 +MCDRAM Memory writeback bandwidth [MBytes/s] 1.0E-06*((EDBOX0C1+EDBOX1C1+EDBOX2C1+EDBOX3C1+EDBOX4C1+EDBOX5C1+EDBOX6C1+EDBOX7C1)-(MBOX0C0+MBOX1C0+MBOX2C0+MBOX4C0+MBOX5C0+MBOX6C0))*64.0/time +MCDRAM Memory writeback data volume [GBytes] 1.0E-09*((EDBOX0C1+EDBOX1C1+EDBOX2C1+EDBOX3C1+EDBOX4C1+EDBOX5C1+EDBOX6C1+EDBOX7C1)-(MBOX0C0+MBOX1C0+MBOX2C0+MBOX4C0+MBOX5C0+MBOX6C0))*64.0 +MCDRAM Memory bandwidth [MBytes/s] 1.0E-06*(((EDBOX0C0+EDBOX1C0+EDBOX2C0+EDBOX3C0+EDBOX4C0+EDBOX5C0+EDBOX6C0+EDBOX7C0)-(EUBOX0C0+EUBOX1C0+EUBOX2C0+EUBOX3C0+EUBOX4C0+EUBOX5C0+EUBOX6C0+EUBOX7C0)-(EUBOX0C1+EUBOX1C1+EUBOX2C1+EUBOX3C1+EUBOX4C1+EUBOX5C1+EUBOX6C1+EUBOX7C1))+((EDBOX0C1+EDBOX1C1+EDBOX2C1+EDBOX3C1+EDBOX4C1+EDBOX5C1+EDBOX6C1+EDBOX7C1)-(MBOX0C0+MBOX1C0+MBOX2C0+MBOX4C0+MBOX5C0+MBOX6C0)))*64.0/time +MCDRAM Memory data volume [GBytes] 1.0E-09*(((EDBOX0C0+EDBOX1C0+EDBOX2C0+EDBOX3C0+EDBOX4C0+EDBOX5C0+EDBOX6C0+EDBOX7C0)-(EUBOX0C0+EUBOX1C0+EUBOX2C0+EUBOX3C0+EUBOX4C0+EUBOX5C0+EUBOX6C0+EUBOX7C0)-(EUBOX0C1+EUBOX1C1+EUBOX2C1+EUBOX3C1+EUBOX4C1+EUBOX5C1+EUBOX6C1+EUBOX7C1))+((EDBOX0C1+EDBOX1C1+EDBOX2C1+EDBOX3C1+EDBOX4C1+EDBOX5C1+EDBOX6C1+EDBOX7C1)-(MBOX0C0+MBOX1C0+MBOX2C0+MBOX4C0+MBOX5C0+MBOX6C0)))*64.0 +DDR Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX4C0+MBOX5C0+MBOX6C0)*64.0/time +DDR Memory read data volume [GBytes] 
1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX4C0+MBOX5C0+MBOX6C0)*64.0 +DDR Memory writeback bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX4C1+MBOX5C1+MBOX6C1)*64.0/time +DDR Memory writeback data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX4C1+MBOX5C1+MBOX6C1)*64.0 +DDR Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX4C1+MBOX5C1+MBOX6C1)*64.0/time +DDR Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX4C1+MBOX5C1+MBOX6C1)*64.0 + + +LONG +Formulas: +MCDRAM Memory read bandwidth [MBytes/s] = 1.0E-06*(sum(EDC_RPQ_INSERTS)-sum(EDC_MISS_CLEAN)-sum(EDC_MISS_DIRTY))*64/time +MCDRAM Memory read data volume [GBytes] = 1.0E-09*(sum(EDC_RPQ_INSERTS)-sum(EDC_MISS_CLEAN)-sum(EDC_MISS_DIRTY))*64 +MCDRAM Memory writeback bandwidth [MBytes/s] = 1.0E-06*(sum(EDC_WPQ_INSERTS)-sum(MC_CAS_READS))*64/time +MCDRAM Memory writeback data volume [GBytes] = 1.0E-09*(sum(EDC_WPQ_INSERTS)-sum(MC_CAS_READS))*64 +MCDRAM Memory bandwidth [MBytes/s] = 1.0E-06*(sum(EDC_RPQ_INSERTS)-sum(EDC_MISS_CLEAN)-sum(EDC_MISS_DIRTY)+sum(EDC_WPQ_INSERTS)-sum(MC_CAS_READS))*64/time +MCDRAM Memory data volume [GBytes] = 1.0E-09*(sum(EDC_RPQ_INSERTS)-sum(EDC_MISS_CLEAN)-sum(EDC_MISS_DIRTY)+sum(EDC_WPQ_INSERTS)-sum(MC_CAS_READS))*64 +DDR Memory read bandwidth [MBytes/s] = 1.0E-06*(sum(MC_CAS_READS))*64/time +DDR Memory read data volume [GBytes] = 1.0E-09*(sum(MC_CAS_READS))*64 +DDR Memory writeback bandwidth [MBytes/s] = 1.0E-06*(sum(MC_CAS_WRITES))*64/time +DDR Memory writeback data volume [GBytes] = 1.0E-09*(sum(MC_CAS_WRITES))*64 +DDR Memory bandwidth [MBytes/s] = 1.0E-06*(sum(MC_CAS_READS)+sum(MC_CAS_WRITES))*64/time +DDR Memory data volume [GBytes] = 1.0E-09*(sum(MC_CAS_READS)+sum(MC_CAS_WRITES))*64 +- +Profiling group to measure data transfers from and to the high bandwidth memory (HBM). diff --git a/collectors/likwid/groups/knl/HBM_OFFCORE.txt b/collectors/likwid/groups/knl/HBM_OFFCORE.txt new file mode 100644 index 0000000..626d08f --- /dev/null +++ b/collectors/likwid/groups/knl/HBM_OFFCORE.txt @@ -0,0 +1,32 @@ +SHORT Memory bandwidth in MBytes/s for High Bandwidth Memory (HBM) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0:MATCH0=0x4908:MATCH1=0x3F8060 OFFCORE_RESPONSE_0_OPTIONS +PMC1:MATCH0=0x32F7:MATCH1=0x3F8060 OFFCORE_RESPONSE_1_OPTIONS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(PMC1)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(PMC1)*64.0 +Memory writeback bandwidth [MBytes/s] 1.0E-06*(PMC0)*64.0/time +Memory writeback data volume [GBytes] 1.0E-09*(PMC0)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(sum(OFFCORE_RESPONSE_1_OPTIONS:MATCH0=0x32F7:MATCH1=0x3F8060))*64/time +Memory read data volume [GBytes] = 1.0E-09*(sum(OFFCORE_RESPONSE_1_OPTIONS:MATCH0=0x32F7:MATCH1=0x3F8060))*64 +Memory writeback bandwidth [MBytes/s] = 1.0E-06*(sum(OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x4908:MATCH1=0x3F8060))*64/time +Memory writeback data volume [GBytes] = 1.0E-09*(sum(OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x4908:MATCH1=0x3F8060))*64 +Memory bandwidth [MBytes/s] = 1.0E-06*(sum(OFFCORE_RESPONSE_1_OPTIONS:MATCH0=0x32F7:MATCH1=0x3F8060)+sum(OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x4908:MATCH1=0x3F8060))*64/time +Memory data volume [GBytes] = 1.0E-09*(sum(OFFCORE_RESPONSE_1_OPTIONS:MATCH0=0x32F7:MATCH1=0x3F8060)+sum(OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x4908:MATCH1=0x3F8060))*64 +- +Profiling group to measure data
transfers from and to the high bandwidth memory (HBM). +If possible, use the HBM or HBM_CACHE group because they provide more accurate counts. diff --git a/collectors/likwid/groups/knl/ICACHE.txt b/collectors/likwid/groups/knl/ICACHE.txt new file mode 100644 index 0000000..5f11ad6 --- /dev/null +++ b/collectors/likwid/groups/knl/ICACHE.txt @@ -0,0 +1,25 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ICACHE_ACCESSES +PMC1 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 + +LONG +Formulas: +L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY +L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES +- +This group measures some L1 instruction cache metrics. diff --git a/collectors/likwid/groups/knl/L2.txt b/collectors/likwid/groups/knl/L2.txt new file mode 100644 index 0000000..4a9370c --- /dev/null +++ b/collectors/likwid/groups/knl/L2.txt @@ -0,0 +1,36 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_REQUESTS_REFERENCE +PMC1:MATCH0=0x0002:MATCH1=0x1 OFFCORE_RESPONSE_0_OPTIONS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 non-RFO bandwidth [MBytes/s] 1.E-06*(PMC0)*64.0/time +L2 non-RFO data volume [GByte] 1.E-09*PMC0*64.0 +L2 RFO bandwidth [MBytes/s] 1.E-06*(PMC1)*64.0/time +L2 RFO data volume [GByte] 1.E-09*(PMC1)*64.0 +L2 bandwidth [MBytes/s] 1.E-06*(PMC0+PMC1)*64.0/time +L2 data volume [GByte] 1.E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L2 non-RFO bandwidth [MBytes/s] = 1.E-06*L2_REQUESTS_REFERENCE*64.0/time +L2 non-RFO data volume [GByte] = 1.E-09*L2_REQUESTS_REFERENCE*64.0 +L2 RFO bandwidth [MBytes/s] = 1.E-06*(OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x0002:MATCH1=0x1)*64.0/time +L2 RFO data volume [GByte] = 1.E-09*(OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x0002:MATCH1=0x1)*64.0 +L2 bandwidth [MBytes/s] = 1.E-06*(L2_REQUESTS_REFERENCE+OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x0002:MATCH1=0x1)*64.0/time +L2 data volume [GByte] = 1.E-09*(L2_REQUESTS_REFERENCE+OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x0002:MATCH1=0x1)*64.0 +-- +The L2 bandwidth and data volume do not contain RFOs (also called +write-allocates). The RFO bandwidth and data volume are only accurate when all +used data fits in the L2 cache. As soon as the data exceeds the L2 cache size, +the RFO metrics are too high. +Moreover, with an increasing number of measured cores, the non-RFO metrics overcount +but commonly stay within a 10% error.
diff --git a/collectors/likwid/groups/knl/L2CACHE.txt b/collectors/likwid/groups/knl/L2CACHE.txt new file mode 100644 index 0000000..e6de92f --- /dev/null +++ b/collectors/likwid/groups/knl/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_UOPS_RETIRED_L2_HIT_LOADS +PMC1 MEM_UOPS_RETIRED_L2_MISS_LOADS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate (PMC0+PMC1)/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/(PMC0+PMC1) + +LONG +Formulas: +L2 request rate = (MEM_UOPS_RETIRED_L2_HIT_LOADS+MEM_UOPS_RETIRED_L2_MISS_LOADS)/INSTR_RETIRED_ANY +L2 miss rate = MEM_UOPS_RETIRED_L2_MISS_LOADS/INSTR_RETIRED_ANY +L2 miss ratio = MEM_UOPS_RETIRED_L2_MISS_LOADS/(MEM_UOPS_RETIRED_L2_HIT_LOADS+MEM_UOPS_RETIRED_L2_MISS_LOADS) +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure how often it was necessary to get +cache lines from memory. And finally L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get data cache miss ratio as low as possible by increasing your cache +reuse. + diff --git a/collectors/likwid/groups/knl/MEM.txt b/collectors/likwid/groups/knl/MEM.txt new file mode 100644 index 0000000..0e53431 --- /dev/null +++ b/collectors/likwid/groups/knl/MEM.txt @@ -0,0 +1,47 @@ +SHORT Memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +MBOX0C0 MC_CAS_READS +MBOX0C1 MC_CAS_WRITES +MBOX1C0 MC_CAS_READS +MBOX1C1 MC_CAS_WRITES +MBOX2C0 MC_CAS_READS +MBOX2C1 MC_CAS_WRITES +MBOX4C0 MC_CAS_READS +MBOX4C1 MC_CAS_WRITES +MBOX5C0 MC_CAS_READS +MBOX5C1 MC_CAS_WRITES +MBOX6C0 MC_CAS_READS +MBOX6C1 MC_CAS_WRITES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX4C0+MBOX5C0+MBOX6C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX4C0+MBOX5C0+MBOX6C0)*64.0 +Memory writeback bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX4C1+MBOX5C1+MBOX6C1)*64.0/time +Memory writeback data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX4C1+MBOX5C1+MBOX6C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX4C1+MBOX5C1+MBOX6C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX4C1+MBOX5C1+MBOX6C1)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(sum(MC_CAS_READS))*64/time +Memory read data volume [GBytes] = 1.0E-09*(sum(MC_CAS_READS))*64 +Memory writeback bandwidth [MBytes/s] = 1.0E-06*(sum(MC_CAS_WRITES))*64/time +Memory writeback data volume [GBytes] = 1.0E-09*(sum(MC_CAS_WRITES))*64 +Memory bandwidth [MBytes/s] = 1.0E-06*(sum(MC_CAS_READS)+sum(MC_CAS_WRITES))*64/time +Memory data volume [GBytes] = 1.0E-09*(sum(MC_CAS_READS)+sum(MC_CAS_WRITES))*64 +- +Profiling group to measure L2 to MEM load cache bandwidth. 
The bandwidth is computed by the +number of cache lines allocated in the L2 cache. Since the evicted cache lines cannot be +retrieved, this group measures only the load cache bandwidth. The +writeback metrics count only modified cache lines that are written back to go to +the exclusive state. +The group also outputs the total load and writeback data volume transferred between memory and L2. + diff --git a/collectors/likwid/groups/knl/TLB_DATA.txt b/collectors/likwid/groups/knl/TLB_DATA.txt new file mode 100644 index 0000000..5f2617f --- /dev/null +++ b/collectors/likwid/groups/knl/TLB_DATA.txt @@ -0,0 +1,27 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 PAGE_WALKS_DTLB_COUNT +PMC1 PAGE_WALKS_DTLB_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB misses PMC0 +L1 DTLB miss rate PMC0/FIXC0 +L1 DTLB miss duration [Cyc] PMC1/PMC0 + +LONG +Formulas: +L1 DTLB misses = PAGE_WALKS_DTLB_COUNT +L1 DTLB miss rate = PAGE_WALKS_DTLB_COUNT / INSTR_RETIRED_ANY +L1 DTLB miss duration [Cyc] = PAGE_WALKS_DTLB_CYCLES / PAGE_WALKS_DTLB_COUNT +- +The DTLB load and store miss rates give a measure of how often a TLB miss occurred +per instruction. The duration measures how many cycles a page table walk took. + diff --git a/collectors/likwid/groups/knl/TLB_INSTR.txt b/collectors/likwid/groups/knl/TLB_INSTR.txt new file mode 100644 index 0000000..f3dd3ec --- /dev/null +++ b/collectors/likwid/groups/knl/TLB_INSTR.txt @@ -0,0 +1,27 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 PAGE_WALKS_ITLB_COUNT +PMC1 PAGE_WALKS_ITLB_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + + +LONG +Formulas: +L1 ITLB misses = PAGE_WALKS_ITLB_COUNT +L1 ITLB miss rate = PAGE_WALKS_ITLB_COUNT / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = PAGE_WALKS_ITLB_CYCLES / PAGE_WALKS_ITLB_COUNT +- +The ITLB miss rate gives a measure of how often a TLB miss occurred +per instruction. The duration measures how many cycles a page table walk took. diff --git a/collectors/likwid/groups/knl/UOPS_STALLS.txt b/collectors/likwid/groups/knl/UOPS_STALLS.txt new file mode 100644 index 0000000..97cfa99 --- /dev/null +++ b/collectors/likwid/groups/knl/UOPS_STALLS.txt @@ -0,0 +1,25 @@ +SHORT UOP retirement stalls + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_RETIRED_STALLED_CYCLES +PMC1 UOPS_RETIRED_STALLS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of stalls PMC1 +Avg. stall duration [cyc] PMC0/PMC1 +Stall ratio PMC0/FIXC1 + +LONG +Formulas: +Number of stalls = UOPS_RETIRED_STALLS +Avg. stall duration [cyc] = UOPS_RETIRED_STALLED_CYCLES/UOPS_RETIRED_STALLS +Stall ratio = UOPS_RETIRED_STALLED_CYCLES/CPU_CLK_UNHALTED_CORE +- +This group measures stalls in the UOP retirement.
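The stall and TLB groups above share a common count/duration pattern: one event counts episodes, a second counts the cycles spent in them, and the group divides the two. A short Go sketch of the UOPS_STALLS derivation, with invented counter values purely for illustration:

package main

import "fmt"

func main() {
	// Hypothetical raw counts for one measurement interval.
	stalledCycles := 4.0e9   // PMC0: UOPS_RETIRED_STALLED_CYCLES
	stalls := 2.5e8          // PMC1: UOPS_RETIRED_STALLS
	unhaltedCycles := 1.2e10 // FIXC1: CPU_CLK_UNHALTED_CORE

	fmt.Printf("Number of stalls: %.3g\n", stalls)
	fmt.Printf("Avg. stall duration [cyc]: %.1f\n", stalledCycles/stalls)
	fmt.Printf("Stall ratio: %.2f\n", stalledCycles/unhaltedCycles)
}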
diff --git a/collectors/likwid/groups/nehalem/BRANCH.txt b/collectors/likwid/groups/nehalem/BRANCH.txt new file mode 100644 index 0000000..1ef9f11 --- /dev/null +++ b/collectors/likwid/groups/nehalem/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often, on average, a branch or a mispredicted branch occurred +per retired instruction. The branch misprediction ratio directly relates the number +of mispredicted branches to all branch instructions. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/nehalem/CACHE.txt b/collectors/likwid/groups/nehalem/CACHE.txt new file mode 100644 index 0000000..6603171 --- /dev/null +++ b/collectors/likwid/groups/nehalem/CACHE.txt @@ -0,0 +1,36 @@ +SHORT Data cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPL +PMC1 L1D_ALL_REF_ANY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +data cache misses PMC0 +data cache request rate PMC1/FIXC0 +data cache miss rate PMC0/FIXC0 +data cache miss ratio PMC0/PMC1 + +LONG +Formulas: +data cache misses = L1D_REPL +data cache request rate = L1D_ALL_REF_ANY / INSTR_RETIRED_ANY +data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY +data cache miss ratio = L1D_REPL / L1D_ALL_REF_ANY +- +This group measures the locality of your data accesses with regard to the +L1 cache. Data cache request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The data cache miss rate gives a measure how often it was necessary to get +cache lines from higher levels of the memory hierarchy. And finally +data cache miss ratio tells you how many of your memory references required +a cache line to be loaded from a higher level. While the data cache miss rate +might be given by your algorithm you should try to get data cache miss ratio +as low as possible by increasing your cache reuse.
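Request rate, miss rate, and miss ratio recur in all of the CACHE-style groups in this patch, so it is worth spelling out once how they relate. A minimal Go sketch, assuming made-up counter values (not from any real measurement):

package main

import "fmt"

// cacheStats mirrors the three derived CACHE metrics.
func cacheStats(refs, misses, instr float64) (requestRate, missRate, missRatio float64) {
	requestRate = refs / instr // L1D_ALL_REF_ANY / INSTR_RETIRED_ANY
	missRate = misses / instr  // L1D_REPL / INSTR_RETIRED_ANY
	missRatio = misses / refs  // L1D_REPL / L1D_ALL_REF_ANY
	return
}

func main() {
	rr, mr, ratio := cacheStats(4.0e9, 2.0e8, 1.0e10)
	fmt.Printf("request rate %.2f, miss rate %.3f, miss ratio %.3f\n", rr, mr, ratio)
}

Note that the miss ratio is the only one of the three that is independent of how data-intensive the code is, which is why the description recommends optimizing it via cache reuse.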
+ diff --git a/collectors/likwid/groups/nehalem/DATA.txt b/collectors/likwid/groups/nehalem/DATA.txt new file mode 100644 index 0000000..31bba51 --- /dev/null +++ b/collectors/likwid/groups/nehalem/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_INST_RETIRED_LOADS +PMC1 MEM_INST_RETIRED_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_INST_RETIRED_LOADS/MEM_INST_RETIRED_STORES +- +This is a simple metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/nehalem/DIVIDE.txt b/collectors/likwid/groups/nehalem/DIVIDE.txt new file mode 100644 index 0000000..6c17295 --- /dev/null +++ b/collectors/likwid/groups/nehalem/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ARITH_NUM_DIV +PMC1 ARITH_CYCLES_DIV_BUSY + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_NUM_DIV +Avg. divide unit usage duration = ARITH_CYCLES_DIV_BUSY/ARITH_NUM_DIV +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/nehalem/FLOPS_DP.txt b/collectors/likwid/groups/nehalem/FLOPS_DP.txt new file mode 100644 index 0000000..0c2e56c --- /dev/null +++ b/collectors/likwid/groups/nehalem/FLOPS_DP.txt @@ -0,0 +1,35 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR +PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION +PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +SP [MUOPS/s] 1.0E-06*PMC2/time +DP [MUOPS/s] 1.0E-06*PMC3/time + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime +Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime +SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime +DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime +- +The Nehalem cannot measure MFLOP/s exactly when mixed-precision calculations are done. +Therefore both single and double precision are measured to ensure the correctness +of the measurements. You can check whether your code was vectorized by comparing the counts of +FP_COMP_OPS_EXE_SSE_FP_PACKED and FP_COMP_OPS_EXE_SSE_FP_SCALAR.
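The factor of two in the DP metric comes from a packed 128-bit SSE uop carrying two double-precision operations while a scalar uop carries one. A hedged Go sketch of the formula, with invented uop counts:

package main

import "fmt"

// dpMFlops implements DP [MFLOP/s] = 1.0E-06*(PACKED*2.0+SCALAR)/time.
// A packed 128-bit SSE uop performs two double-precision operations,
// a scalar uop performs one.
func dpMFlops(packed, scalar, seconds float64) float64 {
	return 1.0e-06 * (packed*2.0 + scalar) / seconds
}

func main() {
	fmt.Printf("DP: %.0f MFLOP/s\n", dpMFlops(3.0e9, 5.0e8, 1.0))
}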
+ diff --git a/collectors/likwid/groups/nehalem/FLOPS_SP.txt b/collectors/likwid/groups/nehalem/FLOPS_SP.txt new file mode 100644 index 0000000..8046cbd --- /dev/null +++ b/collectors/likwid/groups/nehalem/FLOPS_SP.txt @@ -0,0 +1,35 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR +PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION +PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +SP [MUOPS/s] 1.0E-06*PMC2/time +DP [MUOPS/s] 1.0E-06*PMC3/time + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime +Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime +SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime +DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime +- +The Nehalem cannot measure MFLOP/s exactly when mixed-precision calculations are done. +Therefore both single and double precision are measured to ensure the correctness +of the measurements. You can check whether your code was vectorized by comparing the counts of +FP_COMP_OPS_EXE_SSE_FP_PACKED and FP_COMP_OPS_EXE_SSE_FP_SCALAR. + diff --git a/collectors/likwid/groups/nehalem/FLOPS_X87.txt b/collectors/likwid/groups/nehalem/FLOPS_X87.txt new file mode 100644 index 0000000..39cd8b4 --- /dev/null +++ b/collectors/likwid/groups/nehalem/FLOPS_X87.txt @@ -0,0 +1,21 @@ +SHORT X87 MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INST_RETIRED_X87 + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +X87 [MFLOP/s] 1.0E-06*PMC0/time + +LONG +Formulas: +X87 [MFLOP/s] = 1.0E-06*INST_RETIRED_X87/runtime +- +Profiling group to measure X87 FLOP rate. + diff --git a/collectors/likwid/groups/nehalem/ICACHE.txt b/collectors/likwid/groups/nehalem/ICACHE.txt new file mode 100644 index 0000000..49943ff --- /dev/null +++ b/collectors/likwid/groups/nehalem/ICACHE.txt @@ -0,0 +1,25 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1I_READS +PMC1 L1I_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 + +LONG +Formulas: +L1I request rate = L1I_READS / INSTR_RETIRED_ANY +L1I miss rate = L1I_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = L1I_MISSES / L1I_READS +- +This group measures some L1 instruction cache metrics.
diff --git a/collectors/likwid/groups/nehalem/L2.txt b/collectors/likwid/groups/nehalem/L2.txt new file mode 100644 index 0000000..e2715cc --- /dev/null +++ b/collectors/likwid/groups/nehalem/L2.txt @@ -0,0 +1,40 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPL +PMC1 L1D_M_EVICT +PMC2 L1I_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is +computed by the number of cache line allocated in the L1 and the +number of modified cache lines evicted from the L1. Also reports on +total data volume transferred between L2 and L1 cache. +Note that this bandwidth also includes data transfers due to a +write allocate load on a store miss in L1 and traffic caused by misses in the +L1 instruction cache. + + diff --git a/collectors/likwid/groups/nehalem/L2CACHE.txt b/collectors/likwid/groups/nehalem/L2CACHE.txt new file mode 100644 index 0000000..343b263 --- /dev/null +++ b/collectors/likwid/groups/nehalem/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_RQSTS_REFERENCES +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_RQSTS_REFERENCES/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_RQSTS_REFERENCES +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure how often it was necessary to get +cache lines from memory. And finally L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get data cache miss ratio as low as possible by increasing your cache reuse. 
+ + diff --git a/collectors/likwid/groups/nehalem/L3.txt b/collectors/likwid/groups/nehalem/L3.txt new file mode 100644 index 0000000..70b5f29 --- /dev/null +++ b/collectors/likwid/groups/nehalem/L3.txt @@ -0,0 +1,36 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ANY +PMC1 L2_LINES_OUT_DEMAND_DIRTY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ANY*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ANY*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DEMAND_DIRTY*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the +number of cache line allocated in the L2 and the number of modified cache lines +evicted from the L2. Also reports total data volume between L3 and L2 caches. +Note that this bandwidth also includes data transfers due to a write allocate +load on a store miss in L2. + diff --git a/collectors/likwid/groups/nehalem/L3CACHE.txt b/collectors/likwid/groups/nehalem/L3CACHE.txt new file mode 100644 index 0000000..15e00ed --- /dev/null +++ b/collectors/likwid/groups/nehalem/L3CACHE.txt @@ -0,0 +1,34 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +UPMC0 UNC_L3_HITS_ANY +UPMC1 UNC_L3_MISS_ANY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate (UPMC0+UPMC1)/FIXC0 +L3 miss rate UPMC1/FIXC0 +L3 miss ratio UPMC1/(UPMC0+UPMC1) + +LONG +Formulas: +L3 request rate = (UNC_L3_HITS_ANY+UNC_L3_MISS_ANY)/INSTR_RETIRED_ANY +L3 miss rate = UNC_L3_MISS_ANY/INSTR_RETIRED_ANY +L3 miss ratio = UNC_L3_MISS_ANY/(UNC_L3_HITS_ANY+UNC_L3_MISS_ANY) +- +This group measures the locality of your data accesses with regard to the +L3 cache. L3 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L3 miss rate gives a measure how often it was necessary to get +cache lines from memory. And finally L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get data cache miss ratio as low as possible by increasing your cache reuse. 
+ + diff --git a/collectors/likwid/groups/nehalem/MEM.txt b/collectors/likwid/groups/nehalem/MEM.txt new file mode 100644 index 0000000..b528670 --- /dev/null +++ b/collectors/likwid/groups/nehalem/MEM.txt @@ -0,0 +1,49 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +UPMC0 UNC_QMC_NORMAL_READS_ANY +UPMC1 UNC_QMC_WRITES_FULL_ANY +UPMC2 UNC_QHL_REQUESTS_REMOTE_READS +UPMC3 UNC_QHL_REQUESTS_REMOTE_WRITES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*UPMC0*64.0/time +Memory read data volume [GBytes] 1.0E-09*UPMC0*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*UPMC1*64.0/time +Memory write data volume [GBytes] 1.0E-09*UPMC1*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64.0 +Remote memory read bandwidth [MBytes/s] 1.0E-06*UPMC2*64.0/time +Remote memory read data volume [GBytes] 1.0E-09*UPMC2*64.0 +Remote memory write bandwidth [MBytes/s] 1.0E-06*UPMC3*64.0/time +Remote memory write data volume [GBytes] 1.0E-09*UPMC3*64.0 +Remote memory bandwidth [MBytes/s] 1.0E-06*(UPMC2+UPMC3)*64.0/time +Remote memory data volume [GBytes] 1.0E-09*(UPMC2+UPMC3)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_NORMAL_READS_ANY*64.0/time +Memory read data volume [GBytes] = 1.0E-09*UNC_QMC_NORMAL_READS_ANY*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_WRITES_FULL_ANY*64.0/time +Memory write data volume [GBytes] = 1.0E-09*UNC_QMC_WRITES_FULL_ANY*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0/time +Memory data volume [GBytes] = 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0 +Remote memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_READS*64.0/time +Remote memory read data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_READS*64.0 +Remote memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0/time +Remote memory write data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0 +Remote memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0/time +Remote memory data volume [GBytes] = 1.0E-09*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +This group will be measured by one core per socket. The Remote Read BW tells +you if cache lines are transferred between sockets, meaning that cores access +data owned by a remote NUMA domain. 
+ diff --git a/collectors/likwid/groups/nehalem/SCHEDULER.txt b/collectors/likwid/groups/nehalem/SCHEDULER.txt new file mode 100644 index 0000000..0e43cce --- /dev/null +++ b/collectors/likwid/groups/nehalem/SCHEDULER.txt @@ -0,0 +1,25 @@ +SHORT Scheduler Ports + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_PORT0 +PMC1 UOPS_EXECUTED_PORT1 +PMC2 UOPS_EXECUTED_PORT5 + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Ratio Port 1 PMC1/PMC0 +Ratio Port 5 PMC2/PMC0 + +LONG +Formulas: +Ratio Port 1 = UOPS_EXECUTED_PORT1/UOPS_EXECUTED_PORT0 +Ratio Port 5 = UOPS_EXECUTED_PORT5/UOPS_EXECUTED_PORT0 +- +Measures how many instructions were scheduled on which issue port. + diff --git a/collectors/likwid/groups/nehalem/TLB.txt b/collectors/likwid/groups/nehalem/TLB.txt new file mode 100644 index 0000000..c380851 --- /dev/null +++ b/collectors/likwid/groups/nehalem/TLB.txt @@ -0,0 +1,30 @@ +SHORT TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_MISSES_ANY +PMC1 L1D_ALL_REF_ANY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB request rate PMC1/FIXC0 +L1 DTLB miss rate PMC0/FIXC0 +L1 DTLB miss ratio PMC0/PMC1 + +LONG +Formulas: +L1 DTLB request rate = L1D_ALL_REF_ANY / INSTR_RETIRED_ANY +L1 DTLB miss rate = DTLB_MISSES_ANY / INSTR_RETIRED_ANY +L1 DTLB miss ratio = DTLB_MISSES_ANY / L1D_ALL_REF_ANY +- +L1 DTLB request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The DTLB miss rate gives a measure how often a TLB miss occurred +per instruction. And finally, the L1 DTLB miss ratio tells you how many +of your memory references caused a TLB miss on average. + diff --git a/collectors/likwid/groups/nehalemEX/BRANCH.txt b/collectors/likwid/groups/nehalemEX/BRANCH.txt new file mode 100644 index 0000000..1ef9f11 --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often, on average, a branch or a mispredicted branch occurred +per retired instruction. The branch misprediction ratio directly relates the number +of mispredicted branches to all branch instructions. +Instructions per branch is 1/branch rate.
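To make the four BRANCH metrics concrete, here is a small Go sketch evaluating them side by side; the counter values are hypothetical and only illustrate the relationships (e.g. instructions per branch being the reciprocal of the branch rate):

package main

import "fmt"

func main() {
	instr := 1.0e10   // FIXC0: INSTR_RETIRED_ANY
	branches := 2.0e9 // PMC0: BR_INST_RETIRED_ALL_BRANCHES
	missed := 1.0e8   // PMC1: BR_MISP_RETIRED_ALL_BRANCHES

	fmt.Printf("Branch rate: %.3f\n", branches/instr)
	fmt.Printf("Branch misprediction rate: %.4f\n", missed/instr)
	fmt.Printf("Branch misprediction ratio: %.3f\n", missed/branches)
	fmt.Printf("Instructions per branch: %.1f\n", instr/branches) // = 1/branch rate
}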
+ diff --git a/collectors/likwid/groups/nehalemEX/CACHE.txt b/collectors/likwid/groups/nehalemEX/CACHE.txt new file mode 100644 index 0000000..6603171 --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/CACHE.txt @@ -0,0 +1,36 @@ +SHORT Data cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPL +PMC1 L1D_ALL_REF_ANY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +data cache misses PMC0 +data cache request rate PMC1/FIXC0 +data cache miss rate PMC0/FIXC0 +data cache miss ratio PMC0/PMC1 + +LONG +Formulas: +data cache misses = L1D_REPL +data cache request rate = L1D_ALL_REF_ANY / INSTR_RETIRED_ANY +data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY +data cache miss ratio = L1D_REPL / L1D_ALL_REF_ANY +- +This group measures the locality of your data accesses with regard to the +L1 cache. Data cache request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The data cache miss rate gives a measure how often it was necessary to get +cache lines from higher levels of the memory hierarchy. And finally +data cache miss ratio tells you how many of your memory references required +a cache line to be loaded from a higher level. While the data cache miss rate +might be given by your algorithm you should try to get data cache miss ratio +as low as possible by increasing your cache reuse. + diff --git a/collectors/likwid/groups/nehalemEX/DATA.txt b/collectors/likwid/groups/nehalemEX/DATA.txt new file mode 100644 index 0000000..31bba51 --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_INST_RETIRED_LOADS +PMC1 MEM_INST_RETIRED_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_INST_RETIRED_LOADS/MEM_INST_RETIRED_STORES +- +This is a simple metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/nehalemEX/DIVIDE.txt b/collectors/likwid/groups/nehalemEX/DIVIDE.txt new file mode 100644 index 0000000..cb15563 --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0:EDGEDETECT ARITH_CYCLES_DIV_BUSY +PMC1 ARITH_CYCLES_DIV_BUSY + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0:EDGEDETECT +Avg. divide unit usage duration PMC1/PMC0:EDGEDETECT + +LONG +Formulas: +Number of divide ops = ARITH_CYCLES_DIV_BUSY:EDGEDETECT +Avg. 
divide unit usage duration = ARITH_CYCLES_DIV_BUSY/ARITH_CYCLES_DIV_BUSY:EDGEDETECT +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/nehalemEX/FLOPS_DP.txt b/collectors/likwid/groups/nehalemEX/FLOPS_DP.txt new file mode 100644 index 0000000..0c2e56c --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/FLOPS_DP.txt @@ -0,0 +1,35 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR +PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION +PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +SP [MUOPS/s] 1.0E-06*PMC2/time +DP [MUOPS/s] 1.0E-06*PMC3/time + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime +Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime +SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime +DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime +- +The Nehalem cannot measure MFLOP/s exactly when mixed-precision calculations are done. +Therefore both single and double precision are measured to ensure the correctness +of the measurements. You can check whether your code was vectorized by comparing the counts of +FP_COMP_OPS_EXE_SSE_FP_PACKED and FP_COMP_OPS_EXE_SSE_FP_SCALAR. + diff --git a/collectors/likwid/groups/nehalemEX/FLOPS_SP.txt b/collectors/likwid/groups/nehalemEX/FLOPS_SP.txt new file mode 100644 index 0000000..8046cbd --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/FLOPS_SP.txt @@ -0,0 +1,35 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR +PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION +PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +SP [MUOPS/s] 1.0E-06*PMC2/time +DP [MUOPS/s] 1.0E-06*PMC3/time + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime +Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime +SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime +DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime +- +The Nehalem cannot measure MFLOP/s exactly when mixed-precision calculations are done. +Therefore both single and double precision are measured to ensure the correctness +of the measurements. You can check whether your code was vectorized by comparing the counts of +FP_COMP_OPS_EXE_SSE_FP_PACKED and FP_COMP_OPS_EXE_SSE_FP_SCALAR.
+ diff --git a/collectors/likwid/groups/nehalemEX/FLOPS_X87.txt b/collectors/likwid/groups/nehalemEX/FLOPS_X87.txt new file mode 100644 index 0000000..39cd8b4 --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/FLOPS_X87.txt @@ -0,0 +1,21 @@ +SHORT X87 MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INST_RETIRED_X87 + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +X87 [MFLOP/s] 1.0E-06*PMC0/time + +LONG +Formulas: +X87 [MFLOP/s] = 1.0E-06*INST_RETIRED_X87/runtime +- +Profiling group to measure X87 FLOP rate. + diff --git a/collectors/likwid/groups/nehalemEX/ICACHE.txt b/collectors/likwid/groups/nehalemEX/ICACHE.txt new file mode 100644 index 0000000..49943ff --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/ICACHE.txt @@ -0,0 +1,25 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1I_READS +PMC1 L1I_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 + +LONG +Formulas: +L1I request rate = L1I_READS / INSTR_RETIRED_ANY +L1I miss rate = L1I_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = L1I_MISSES / L1I_READS +- +This group measures some L1 instruction cache metrics. diff --git a/collectors/likwid/groups/nehalemEX/L2.txt b/collectors/likwid/groups/nehalemEX/L2.txt new file mode 100644 index 0000000..e2715cc --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/L2.txt @@ -0,0 +1,40 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPL +PMC1 L1D_M_EVICT +PMC2 L1I_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is +computed by the number of cache line allocated in the L1 and the +number of modified cache lines evicted from the L1. Also reports on +total data volume transferred between L2 and L1 cache. +Note that this bandwidth also includes data transfers due to a +write allocate load on a store miss in L1 and traffic caused by misses in the +L1 instruction cache.
+ + diff --git a/collectors/likwid/groups/nehalemEX/L2CACHE.txt b/collectors/likwid/groups/nehalemEX/L2CACHE.txt new file mode 100644 index 0000000..343b263 --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_RQSTS_REFERENCES +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_RQSTS_REFERENCES/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_RQSTS_REFERENCES +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure how often it was necessary to get +cache lines from memory. And finally L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/nehalemEX/L3.txt b/collectors/likwid/groups/nehalemEX/L3.txt new file mode 100644 index 0000000..51a0811 --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/L3.txt @@ -0,0 +1,37 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ANY +PMC1 L2_LINES_OUT_DEMAND_DIRTY +PMC2 L2_LINES_OUT_PREFETCH_DIRTY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*(PMC1+PMC2)*64.0/time +L3 evict data volume [GBytes] 1.0E-09*(PMC1+PMC2)*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ANY*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ANY*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_OUT_DEMAND_DIRTY+L2_LINES_OUT_PREFETCH_DIRTY)*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*(L2_LINES_OUT_DEMAND_DIRTY+L2_LINES_OUT_PREFETCH_DIRTY)*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY+L2_LINES_OUT_PREFETCH_DIRTY)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY+L2_LINES_OUT_PREFETCH_DIRTY)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the +number of cache line allocated in the L2 and the number of modified cache lines +evicted from the L2. Also reports total data volume between L3 and L2 caches. +Note that this bandwidth also includes data transfers due to a write allocate +load on a store miss in L2. 
+ diff --git a/collectors/likwid/groups/nehalemEX/L3CACHE.txt b/collectors/likwid/groups/nehalemEX/L3CACHE.txt new file mode 100644 index 0000000..c6b204e --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/L3CACHE.txt @@ -0,0 +1,48 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +CBOX0C0 LLC_HITS_ALL +CBOX0C1 LLC_MISSES_ALL +CBOX1C0 LLC_HITS_ALL +CBOX1C1 LLC_MISSES_ALL +CBOX2C0 LLC_HITS_ALL +CBOX2C1 LLC_MISSES_ALL +CBOX3C0 LLC_HITS_ALL +CBOX3C1 LLC_MISSES_ALL +CBOX4C0 LLC_HITS_ALL +CBOX4C1 LLC_MISSES_ALL +CBOX5C0 LLC_HITS_ALL +CBOX5C1 LLC_MISSES_ALL +CBOX6C0 LLC_HITS_ALL +CBOX6C1 LLC_MISSES_ALL +CBOX7C0 LLC_HITS_ALL +CBOX7C1 LLC_MISSES_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate (CBOX0C0+CBOX0C1+CBOX1C0+CBOX1C1+CBOX2C0+CBOX2C1+CBOX3C0+CBOX3C1+CBOX4C0+CBOX4C1+CBOX5C0+CBOX5C1+CBOX6C0+CBOX6C1+CBOX7C0+CBOX7C1)/FIXC0 +L3 miss rate (CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)/FIXC0 +L3 miss ratio (CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)/(CBOX0C0+CBOX0C1+CBOX1C0+CBOX1C1+CBOX2C0+CBOX2C1+CBOX3C0+CBOX3C1+CBOX4C0+CBOX4C1+CBOX5C0+CBOX5C1+CBOX6C0+CBOX6C1+CBOX7C0+CBOX7C1) + +LONG +Formulas: +L3 request rate = (SUM(LLC_HITS_ALL)+SUM(LLC_MISSES_ALL))/INSTR_RETIRED_ANY +L3 miss rate = SUM(LLC_MISSES_ALL)/INSTR_RETIRED_ANY +L3 miss ratio = SUM(LLC_MISSES_ALL)/(SUM(LLC_HITS_ALL)+SUM(LLC_MISSES_ALL)) +- +This group measures the locality of your data accesses with regard to the +L3 cache. L3 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L3 miss rate gives a measure how often it was necessary to get +cache lines from memory. And finally L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get data cache miss ratio as low as possible by increasing your cache reuse. 
+ + diff --git a/collectors/likwid/groups/nehalemEX/MEM.txt b/collectors/likwid/groups/nehalemEX/MEM.txt new file mode 100644 index 0000000..d3d2522 --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/MEM.txt @@ -0,0 +1,43 @@ +SHORT Main memory bandwidth + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +WBOXFIX UNCORE_CLOCK +MBOX0C0 FVC_EV0_BBOX_CMDS_READS +MBOX0C1 DRAM_CMD_CAS_WR_OPN +MBOX1C0 FVC_EV0_BBOX_CMDS_READS +MBOX1C1 DRAM_CMD_CAS_WR_OPN + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*(WBOXFIX)/time +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64 + +LONG +Formulas: +Uncore Clock [MHz] = 1.E-06*(UNCORE_CLOCK)/time +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(FVC_EV0_BBOX_CMDS_READS))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(FVC_EV0_BBOX_CMDS_READS))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(DRAM_CMD_CAS_WR_OPN))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(DRAM_CMD_CAS_WR_OPN))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(FVC_EV0_BBOX_CMDS_READS)+SUM(DRAM_CMD_CAS_WR_OPN))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(FVC_EV0_BBOX_CMDS_READS)+SUM(DRAM_CMD_CAS_WR_OPN))*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +On Nehalem EX it is not possible to measure the write operations with the +FVC_EV0_BBOX_CMDS_WRITES event at the same time as the FVC_EV0_BBOX_CMDS_READS +because they set contrary bits. The DRAM_CMD_CAS_WR_OPN is an alternative but +it only measures write operations to open pages, hence writes to closed pages +are not included here. + diff --git a/collectors/likwid/groups/nehalemEX/SCHEDULER.txt b/collectors/likwid/groups/nehalemEX/SCHEDULER.txt new file mode 100644 index 0000000..0e43cce --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/SCHEDULER.txt @@ -0,0 +1,25 @@ +SHORT Scheduler Ports + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_PORT0 +PMC1 UOPS_EXECUTED_PORT1 +PMC2 UOPS_EXECUTED_PORT5 + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Ratio Port 1 PMC1/PMC0 +Ratio Port 5 PMC2/PMC0 + +LONG +Formulas: +Ratio Port 1 = UOPS_EXECUTED_PORT1/UOPS_EXECUTED_PORT0 +Ratio Port 5 = UOPS_EXECUTED_PORT5/UOPS_EXECUTED_PORT0 +- +Measures how many instructions were scheduled on which issue port.
+ diff --git a/collectors/likwid/groups/nehalemEX/TLB.txt b/collectors/likwid/groups/nehalemEX/TLB.txt new file mode 100644 index 0000000..0e358b8 --- /dev/null +++ b/collectors/likwid/groups/nehalemEX/TLB.txt @@ -0,0 +1,30 @@ +SHORT TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_MISSES_ANY +PMC1 L1D_ALL_REF_ANY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB request rate PMC1/FIXC0 +L1 DTLB miss rate PMC0/FIXC0 +L1 DTLB miss ratio PMC0/PMC1 + +LONG +Formulas: +L1 DTLB request rate = L1D_ALL_REF_ANY / INSTR_RETIRED_ANY +L1 DTLB miss rate = DTLB_MISSES_ANY / INSTR_RETIRED_ANY +L1 DTLB miss ratio = DTLB_MISSES_ANY / L1D_ALL_REF_ANY +- +L1 DTLB request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The DTLB miss rate gives a measure how often a TLB miss occurred +per instruction. And finally, the L1 DTLB miss ratio tells you how many +of your memory references caused a TLB miss on average. + diff --git a/collectors/likwid/groups/nvidia_gpu_cc_ge_7/DATA.txt b/collectors/likwid/groups/nvidia_gpu_cc_ge_7/DATA.txt new file mode 100644 index 0000000..4171640 --- /dev/null +++ b/collectors/likwid/groups/nvidia_gpu_cc_ge_7/DATA.txt @@ -0,0 +1,16 @@ +SHORT Load to store ratio + +EVENTSET +GPU0 SMSP_SASS_INST_EXECUTED_OP_GLOBAL_LD_SUM +GPU1 SMSP_SASS_INST_EXECUTED_OP_GLOBAL_ST_SUM + +METRICS +Runtime (RDTSC) [s] time +Load to store ratio GPU0/GPU1 + +LONG +Formulas: +Load to store ratio = SMSP_SASS_INST_EXECUTED_OP_GLOBAL_LD_SUM/SMSP_SASS_INST_EXECUTED_OP_GLOBAL_ST_SUM +- +This is a metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/nvidia_gpu_cc_ge_7/FLOPS_DP.txt b/collectors/likwid/groups/nvidia_gpu_cc_ge_7/FLOPS_DP.txt new file mode 100644 index 0000000..7c6ae6c --- /dev/null +++ b/collectors/likwid/groups/nvidia_gpu_cc_ge_7/FLOPS_DP.txt @@ -0,0 +1,19 @@ +SHORT Double-precision floating point + +EVENTSET +GPU0 SMSP_SASS_THREAD_INST_EXECUTED_OP_DADD_PRED_ON_SUM +GPU1 SMSP_SASS_THREAD_INST_EXECUTED_OP_DMUL_PRED_ON_SUM +GPU2 SMSP_SASS_THREAD_INST_EXECUTED_OP_DFMA_PRED_ON_SUM + + +METRICS +Runtime (RDTSC) [s] time +DP [MFLOP/s] 1E-6*(GPU0+GPU1+(GPU2*2))/time + + +LONG +Formulas: +DP [MFLOP/s] = 1E-6*(SMSP_SASS_THREAD_INST_EXECUTED_OP_DADD_PRED_ON_SUM+SMSP_SASS_THREAD_INST_EXECUTED_OP_DMUL_PRED_ON_SUM+2*SMSP_SASS_THREAD_INST_EXECUTED_OP_DFMA_PRED_ON_SUM)/time +-- +This group measures the double-precision floating-point operations per second using the events +SMSP_SASS_THREAD_INST_EXECUTED_OP_D{ADD, MUL, FMA}_PRED_ON_SUM.
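The GPU FLOPS groups weight the FMA count by two because a fused multiply-add retires two floating-point operations in one instruction. A hedged Go sketch of that formula with invented counts:

package main

import "fmt"

// gpuDPMFlops implements DP [MFLOP/s] = 1E-6*(DADD+DMUL+2*DFMA)/time.
// The factor 2 accounts for the fused multiply-add performing two operations.
func gpuDPMFlops(dadd, dmul, dfma, seconds float64) float64 {
	return 1.0e-06 * (dadd + dmul + 2.0*dfma) / seconds
}

func main() {
	fmt.Printf("DP: %.0f MFLOP/s\n", gpuDPMFlops(1.0e9, 1.0e9, 4.0e9, 2.0))
}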
diff --git a/collectors/likwid/groups/nvidia_gpu_cc_ge_7/FLOPS_HP.txt b/collectors/likwid/groups/nvidia_gpu_cc_ge_7/FLOPS_HP.txt new file mode 100644 index 0000000..76c78cd --- /dev/null +++ b/collectors/likwid/groups/nvidia_gpu_cc_ge_7/FLOPS_HP.txt @@ -0,0 +1,19 @@ +SHORT Half-precision floating point + +EVENTSET +GPU0 SMSP_SASS_THREAD_INST_EXECUTED_OP_HADD_PRED_ON_SUM +GPU1 SMSP_SASS_THREAD_INST_EXECUTED_OP_HMUL_PRED_ON_SUM +GPU2 SMSP_SASS_THREAD_INST_EXECUTED_OP_HFMA_PRED_ON_SUM + + +METRICS +Runtime (RDTSC) [s] time +HP [MFLOP/s] 1E-6*(GPU0+GPU1+(GPU2*2))/time + + +LONG +Formulas: +HP [MFLOP/s] = 1E-6*(SMSP_SASS_THREAD_INST_EXECUTED_OP_HADD_PRED_ON_SUM+SMSP_SASS_THREAD_INST_EXECUTED_OP_HMUL_PRED_ON_SUM+2*SMSP_SASS_THREAD_INST_EXECUTED_OP_HFMA_PRED_ON_SUM)/time +-- +This group measures the half-precision floating-point operations per second using the events +SMSP_SASS_THREAD_INST_EXECUTED_OP_H{ADD, MUL, FMA}_PRED_ON_SUM. diff --git a/collectors/likwid/groups/nvidia_gpu_cc_ge_7/FLOPS_SP.txt b/collectors/likwid/groups/nvidia_gpu_cc_ge_7/FLOPS_SP.txt new file mode 100644 index 0000000..cdc7a4e --- /dev/null +++ b/collectors/likwid/groups/nvidia_gpu_cc_ge_7/FLOPS_SP.txt @@ -0,0 +1,19 @@ +SHORT Single-precision floating point + +EVENTSET +GPU0 SMSP_SASS_THREAD_INST_EXECUTED_OP_FADD_PRED_ON_SUM +GPU1 SMSP_SASS_THREAD_INST_EXECUTED_OP_FMUL_PRED_ON_SUM +GPU2 SMSP_SASS_THREAD_INST_EXECUTED_OP_FFMA_PRED_ON_SUM + + +METRICS +Runtime (RDTSC) [s] time +SP [MFLOP/s] 1E-6*(GPU0+GPU1+(GPU2*2))/time + + +LONG +Formulas: +SP [MFLOP/s] = 1E-6*(SMSP_SASS_THREAD_INST_EXECUTED_OP_FADD_PRED_ON_SUM+SMSP_SASS_THREAD_INST_EXECUTED_OP_FMUL_PRED_ON_SUM+2*SMSP_SASS_THREAD_INST_EXECUTED_OP_FFMA_PRED_ON_SUM)/time +-- +This group measures the single-precision floating-point operations per second using the events +SMSP_SASS_THREAD_INST_EXECUTED_OP_F{ADD, MUL, FMA}_PRED_ON_SUM. diff --git a/collectors/likwid/groups/nvidia_gpu_cc_lt_7/DATA.txt b/collectors/likwid/groups/nvidia_gpu_cc_lt_7/DATA.txt new file mode 100644 index 0000000..b96bf08 --- /dev/null +++ b/collectors/likwid/groups/nvidia_gpu_cc_lt_7/DATA.txt @@ -0,0 +1,20 @@ +SHORT Load to store ratio + +EVENTSET +GPU0 GLOBAL_LOAD +GPU1 GLOBAL_STORE +GPU2 INST_EXECUTED +GPU3 ACTIVE_CYCLES + +METRICS +Runtime (RDTSC) [s] time +CPI GPU3/GPU2 +Load to store ratio GPU0/GPU1 + +LONG +Formulas: +CPI = ACTIVE_CYCLES/INST_EXECUTED +Load to store ratio = GLOBAL_LOAD/GLOBAL_STORE +- +This is a metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/nvidia_gpu_cc_lt_7/FLOPS_DP.txt b/collectors/likwid/groups/nvidia_gpu_cc_lt_7/FLOPS_DP.txt new file mode 100644 index 0000000..c03ac90 --- /dev/null +++ b/collectors/likwid/groups/nvidia_gpu_cc_lt_7/FLOPS_DP.txt @@ -0,0 +1,19 @@ +SHORT Double-precision floating point + +EVENTSET +GPU0 INST_EXECUTED_FP64_PIPE_S0 +GPU1 INST_EXECUTED_FP64_PIPE_S1 +GPU2 INST_EXECUTED_FP64_PIPE_S2 +GPU3 INST_EXECUTED_FP64_PIPE_S3 + + +METRICS +DP [MFLOP/s] 1E-6*(GPU0+GPU1+GPU2+GPU3)/time + + +LONG +Formulas: +DP [MFLOP/s] = 1E-6*(SUM(INST_EXECUTED_FP64_PIPE_S*))/time +-- +This group measures the double precision floating-point operations per second using the events +INST_EXECUTED_FP64_PIPE_S*.
diff --git a/collectors/likwid/groups/nvidia_gpu_cc_lt_7/FLOPS_SP.txt b/collectors/likwid/groups/nvidia_gpu_cc_lt_7/FLOPS_SP.txt new file mode 100644 index 0000000..09f58ee --- /dev/null +++ b/collectors/likwid/groups/nvidia_gpu_cc_lt_7/FLOPS_SP.txt @@ -0,0 +1,20 @@ +SHORT Single-precision floating point + +EVENTSET +GPU0 INST_EXECUTED_FP64_PIPE_S0 +GPU1 INST_EXECUTED_FP64_PIPE_S1 +GPU2 INST_EXECUTED_FP64_PIPE_S2 +GPU3 INST_EXECUTED_FP64_PIPE_S3 + + +METRICS +SP [MFLOP/s] 1E-6*(GPU0+GPU1+GPU2+GPU3)/time + + +LONG +Formulas: +SP [MFLOP/s] = 1E-6*(SUM(INST_EXECUTED_FP64_PIPE_S*))/time +-- +This group measures the single-precision floating-point operations per second using the events +INST_EXECUTED_FP64_PIPE_S*. Unfortunately, not all GPUs provide these events although they provide +a metric for SP FP operations which is currently not usable with LIKWID. diff --git a/collectors/likwid/groups/pentiumm/BRANCH.txt b/collectors/likwid/groups/pentiumm/BRANCH.txt new file mode 100644 index 0000000..269a500 --- /dev/null +++ b/collectors/likwid/groups/pentiumm/BRANCH.txt @@ -0,0 +1,17 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +PMC0 BR_INST_EXEC +PMC1 BR_MISSP_EXEC + +METRICS +Runtime (RDTSC) [s] time +Branch misprediction ratio PMC1/PMC0 + +LONG +Formulas: +Branch misprediction ratio = BR_MISSP_EXEC / BR_INST_EXEC +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio directly relates +the mispredicted branches to all branch instructions. diff --git a/collectors/likwid/groups/pentiumm/CPI.txt b/collectors/likwid/groups/pentiumm/CPI.txt new file mode 100644 index 0000000..ae4aa26 --- /dev/null +++ b/collectors/likwid/groups/pentiumm/CPI.txt @@ -0,0 +1,22 @@ +SHORT Cycles per instruction + +EVENTSET +PMC0 UOPS_RETIRED +PMC1 CPU_CLK_UNHALTED + +METRICS +Runtime (RDTSC) [s] time +CPI PMC1/PMC0 +IPC PMC0/PMC1 + +LONG +Formulas: +CPI = CPU_CLK_UNHALTED/UOPS_RETIRED +IPC = UOPS_RETIRED/CPU_CLK_UNHALTED +- +This group measures how efficiently the processor works with +regard to instruction throughput. Also important as a standalone +metric is UOPS_RETIRED as it tells you how many uops +you need to execute for a task. An optimization might show very +low CPI values but execute many more instructions for it. + diff --git a/collectors/likwid/groups/pentiumm/FLOPS_DP.txt b/collectors/likwid/groups/pentiumm/FLOPS_DP.txt new file mode 100644 index 0000000..058a64e --- /dev/null +++ b/collectors/likwid/groups/pentiumm/FLOPS_DP.txt @@ -0,0 +1,20 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +PMC0 EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP +PMC1 EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_DP + +METRICS +Runtime (RDTSC) [s] time +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*(PMC0)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP*2.0+EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_DP)/time +Packed [MUOPS/s] = 1.0E-06*(EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP)/time +Scalar [MUOPS/s] = 1.0E-06*EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_DP/time +- +SSE scalar and packed double precision FLOP rates.
+ diff --git a/collectors/likwid/groups/pentiumm/FLOPS_SP.txt b/collectors/likwid/groups/pentiumm/FLOPS_SP.txt new file mode 100644 index 0000000..d70b835 --- /dev/null +++ b/collectors/likwid/groups/pentiumm/FLOPS_SP.txt @@ -0,0 +1,18 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +PMC0 EMON_SSE_SSE2_COMP_INST_RETIRED_ALL_SP +PMC1 EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_SP + +METRICS +Runtime (RDTSC) [s] time +SP [MFLOP/s] 1.0E-06*(PMC0)/time +Scalar [MUOPS/s] 1.0E-06*(PMC1)/time + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*EMON_SSE_SSE2_COMP_INST_RETIRED_ALL_SP/time +Scalar [MUOPS/s] = 1.0E-06*EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_SP/time +- +SSE scalar and packed single precision FLOP rates. + diff --git a/collectors/likwid/groups/pentiumm/L3.txt b/collectors/likwid/groups/pentiumm/L3.txt new file mode 100644 index 0000000..2ed5293 --- /dev/null +++ b/collectors/likwid/groups/pentiumm/L3.txt @@ -0,0 +1,30 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +PMC0 L2_LINES_IN_ALL_ALL +PMC1 L2_LINES_OUT_ALL_ALL + +METRICS +Runtime (RDTSC) [s] time +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL_ALL*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL_ALL*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_ALL_ALL*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_ALL_ALL*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL_ALL+L2_LINES_OUT_ALL_ALL)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL_ALL+L2_LINES_OUT_ALL_ALL)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the +number of cache lines allocated in the L2 and the number of modified cache lines +evicted from the L2. The group also outputs the total data volume transferred between +the L2 cache and the next level of the memory hierarchy. Note that this bandwidth also includes data transfers due to a write-allocate +load on a store miss in the L2. + diff --git a/collectors/likwid/groups/phi/CACHE.txt b/collectors/likwid/groups/phi/CACHE.txt new file mode 100644 index 0000000..01ac5e4 --- /dev/null +++ b/collectors/likwid/groups/phi/CACHE.txt @@ -0,0 +1,22 @@ +SHORT L1 compute to data access ratio + +EVENTSET +PMC0 VPU_ELEMENTS_ACTIVE +PMC1 DATA_READ_OR_WRITE + +METRICS +Runtime (RDTSC) [s] time +L1 compute intensity PMC0/PMC1 + +LONG +Formulas: +L1 compute intensity = VPU_ELEMENTS_ACTIVE/DATA_READ_OR_WRITE +- +This metric is a way to measure the computational density of an +application, or how many computations it is performing on average for each +piece of data loaded. The L1 compute to data access ratio should be +used to judge the suitability of an application for running on the Intel MIC +architecture. Applications that will perform well on the Intel MIC +architecture should be vectorized, and ideally be able to perform multiple +operations on the same pieces of data (or same cache lines).
+ diff --git a/collectors/likwid/groups/phi/COMPUTE_TO_DATA_RATIO.txt b/collectors/likwid/groups/phi/COMPUTE_TO_DATA_RATIO.txt new file mode 100644 index 0000000..6fdd008 --- /dev/null +++ b/collectors/likwid/groups/phi/COMPUTE_TO_DATA_RATIO.txt @@ -0,0 +1,22 @@ +SHORT L2 compute to data access ratio + +EVENTSET +PMC0 VPU_ELEMENTS_ACTIVE +PMC1 DATA_READ_MISS_OR_WRITE_MISS + +METRICS +Runtime (RDTSC) [s] time +L2 compute intensity PMC0/PMC1 + +LONG +Formulas: +L2 compute intensity = VPU_ELEMENTS_ACTIVE/DATA_READ_MISS_OR_WRITE_MISS +- +This metric is a way to measure the computational density of an +application, or how many computations it is performing on average for each +piece of data loaded. The L2 compute to data access ratio should be +used to judge the suitability of an application for running on the Intel MIC +architecture. Applications that will perform well on the Intel MIC +architecture should be vectorized, and ideally be able to perform multiple +operations on the same pieces of data (or same cache lines). + diff --git a/collectors/likwid/groups/phi/CPI.txt b/collectors/likwid/groups/phi/CPI.txt new file mode 100644 index 0000000..f3d8b4e --- /dev/null +++ b/collectors/likwid/groups/phi/CPI.txt @@ -0,0 +1,23 @@ +SHORT Cycles per instruction + +EVENTSET +PMC0 INSTRUCTIONS_EXECUTED +PMC1 CPU_CLK_UNHALTED + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC1*inverseClock +CPI PMC1/PMC0 +IPC PMC0/PMC1 + +LONG +Formulas: +CPI = CPU_CLK_UNHALTED/INSTRUCTIONS_EXECUTED +IPC = INSTRUCTIONS_EXECUTED/CPU_CLK_UNHALTED +- +This group measures how efficiently the processor works with +regard to instruction throughput. Also important as a standalone +metric is INSTRUCTIONS_EXECUTED as it tells you how many instructions +you need to execute for a task. An optimization might show very +low CPI values but execute many more instructions for it. + diff --git a/collectors/likwid/groups/phi/MEM.txt b/collectors/likwid/groups/phi/MEM.txt new file mode 100644 index 0000000..8899592 --- /dev/null +++ b/collectors/likwid/groups/phi/MEM.txt @@ -0,0 +1,18 @@ +SHORT Memory bandwidth + +EVENTSET +PMC0 DATA_READ_MISS_OR_WRITE_MISS +PMC1 DATA_CACHE_LINES_WRITTEN_BACK + + +METRICS +Runtime (RDTSC) [s] time +Memory bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +Memory bandwidth [MBytes/s] = 1.0E-06*(DATA_READ_MISS_OR_WRITE_MISS+DATA_CACHE_LINES_WRITTEN_BACK)*64.0/time +Memory data volume [GBytes] = 1.0E-09*(DATA_READ_MISS_OR_WRITE_MISS+DATA_CACHE_LINES_WRITTEN_BACK)*64.0 +- +Total memory bandwidth and data volume. diff --git a/collectors/likwid/groups/phi/MEM1.txt b/collectors/likwid/groups/phi/MEM1.txt new file mode 100644 index 0000000..c9f7fb6 --- /dev/null +++ b/collectors/likwid/groups/phi/MEM1.txt @@ -0,0 +1,18 @@ +SHORT L2 write misses + +EVENTSET +PMC0 L2_DATA_WRITE_MISS_MEM_FILL + +METRICS +Runtime (RDTSC) [s] time +L2 RFO bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2 RFO data volume [GBytes] 1.0E-09*PMC0*64.0 + +LONG +Formulas: +L2 RFO bandwidth [MBytes/s] = 1.0E-06*L2_DATA_WRITE_MISS_MEM_FILL*64.0/time +L2 RFO data volume [GBytes] = 1.0E-09*L2_DATA_WRITE_MISS_MEM_FILL*64.0 +- +Bandwidth and data volume fetched from memory due to an L2 data write miss. These +fetches are commonly called write-allocate loads or read-for-ownership (RFO).
+ diff --git a/collectors/likwid/groups/phi/MEM2.txt b/collectors/likwid/groups/phi/MEM2.txt new file mode 100644 index 0000000..d44a823 --- /dev/null +++ b/collectors/likwid/groups/phi/MEM2.txt @@ -0,0 +1,17 @@ +SHORT L2 read misses + +EVENTSET +PMC0 L2_DATA_READ_MISS_MEM_FILL + +METRICS +Runtime (RDTSC) [s] time +L2 read bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2 read data volume [GBytes] 1.0E-09*PMC0*64.0 + +LONG +Formulas: +L2 read bandwidth [MBytes/s] = 1.0E-06*L2_DATA_READ_MISS_MEM_FILL*64.0/time +L2 read data volume [GBytes] = 1.0E-09*L2_DATA_READ_MISS_MEM_FILL*64.0 +- +The data volume and bandwidth caused by read misses in the L2 cache. + diff --git a/collectors/likwid/groups/phi/MEM3.txt b/collectors/likwid/groups/phi/MEM3.txt new file mode 100644 index 0000000..73de570 --- /dev/null +++ b/collectors/likwid/groups/phi/MEM3.txt @@ -0,0 +1,17 @@ +SHORT HW prefetch transfers + +EVENTSET +PMC0 HWP_L2MISS + +METRICS +Runtime (RDTSC) [s] time +Prefetch bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +Prefetch data volume [GBytes] 1.0E-09*PMC0*64.0 + +LONG +Formulas: +Prefetch bandwidth [MBytes/s] = 1.0E-06*HWP_L2MISS*64.0/time +Prefetch data volume [GBytes] = 1.0E-09*HWP_L2MISS*64.0 +- +The bandwidth and data volume caused by L2 misses from the hardware prefetcher. + diff --git a/collectors/likwid/groups/phi/MEM4.txt b/collectors/likwid/groups/phi/MEM4.txt new file mode 100644 index 0000000..9e892bd --- /dev/null +++ b/collectors/likwid/groups/phi/MEM4.txt @@ -0,0 +1,17 @@ +SHORT L2 victim requests + +EVENTSET +PMC0 L2_VICTIM_REQ_WITH_DATA + +METRICS +Runtime (RDTSC) [s] time +Victim bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +Victim data volume [GBytes] 1.0E-09*PMC0*64.0 + +LONG +Formulas: +Victim bandwidth [MBytes/s] = 1.0E-06*L2_VICTIM_REQ_WITH_DATA*64.0/time +Victim data volume [GBytes] = 1.0E-09*L2_VICTIM_REQ_WITH_DATA*64.0 +- +Data volume and bandwidth caused by cache line victims. + diff --git a/collectors/likwid/groups/phi/MEM5.txt b/collectors/likwid/groups/phi/MEM5.txt new file mode 100644 index 0000000..49acb98 --- /dev/null +++ b/collectors/likwid/groups/phi/MEM5.txt @@ -0,0 +1,19 @@ +SHORT L2 snoop hits + +EVENTSET +PMC0 SNP_HITM_L2 + +METRICS +Runtime (RDTSC) [s] time +Snoop bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +Snoop data volume [GBytes] 1.0E-09*PMC0*64.0 + +LONG +Formulas: +Snoop bandwidth [MBytes/s] = 1.0E-06*SNP_HITM_L2*64.0/time +Snoop data volume [GBytes] = 1.0E-09*SNP_HITM_L2*64.0 +- +Snoop traffic caused by HITM requests. HITM requests are L2 requests that +are served by another core's L2 cache where the remote cache line is in modified +state. + diff --git a/collectors/likwid/groups/phi/MEM6.txt b/collectors/likwid/groups/phi/MEM6.txt new file mode 100644 index 0000000..835faf8 --- /dev/null +++ b/collectors/likwid/groups/phi/MEM6.txt @@ -0,0 +1,17 @@ +SHORT L2 read misses + +EVENTSET +PMC0 L2_READ_MISS + +METRICS +Runtime (RDTSC) [s] time +L2 read bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2 read data volume [GBytes] 1.0E-09*PMC0*64.0 + +LONG +Formulas: +L2 read bandwidth [MBytes/s] = 1.0E-06*L2_READ_MISS*64.0/time +L2 read data volume [GBytes] = 1.0E-09*L2_READ_MISS*64.0 +- +Data volume and bandwidth caused by read misses in the L2 cache.
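The MEM* groups above all follow the same arithmetic: each event counts 64-byte cache lines on the Xeon Phi, so bandwidth is count times line size over the measurement interval. A minimal Go sketch of that conversion; the counter value and interval are assumptions:

package main

import "fmt"

// lineBandwidthMB mirrors the 1.0E-06*count*64.0/time pattern used by the
// MEM* groups: cache-line events times line size, scaled to MBytes/s.
func lineBandwidthMB(count, lineBytes, seconds float64) float64 {
	return 1e-6 * count * lineBytes / seconds
}

func main() {
	misses := 5.0e8 // assumed L2_DATA_READ_MISS_MEM_FILL count
	fmt.Printf("L2 read bandwidth: %.1f MBytes/s\n", lineBandwidthMB(misses, 64.0, 1.0))
}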
+ diff --git a/collectors/likwid/groups/phi/MEM_READ.txt b/collectors/likwid/groups/phi/MEM_READ.txt new file mode 100644 index 0000000..fb107b0 --- /dev/null +++ b/collectors/likwid/groups/phi/MEM_READ.txt @@ -0,0 +1,20 @@ +SHORT Memory read bandwidth + +EVENTSET +PMC0 DATA_READ_MISS +PMC1 HWP_L2MISS + + +METRICS +Runtime (RDTSC) [s] time +Memory read bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(DATA_READ_MISS+HWP_L2MISS)*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(DATA_READ_MISS+HWP_L2MISS)*64.0 +- +Bandwidth and data volume of read operations from the memory to L2 cache. The +metric is introduced in the book 'Intel Xeon Phi Coprocessor High-Performance +Programming' by James Jeffers and James Reinders. diff --git a/collectors/likwid/groups/phi/MEM_WRITE.txt b/collectors/likwid/groups/phi/MEM_WRITE.txt new file mode 100644 index 0000000..01043fd --- /dev/null +++ b/collectors/likwid/groups/phi/MEM_WRITE.txt @@ -0,0 +1,20 @@ +SHORT Memory write bandwidth + +EVENTSET +PMC0 L2_VICTIM_REQ_WITH_DATA +PMC1 SNP_HITM_L2 + + +METRICS +Runtime (RDTSC) [s] time +Memory write bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +Memory write bandwidth [MBytes/s] = 1.0E-06*(L2_VICTIM_REQ_WITH_DATA+SNP_HITM_L2)*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(L2_VICTIM_REQ_WITH_DATA+SNP_HITM_L2)*64.0 +- +Bandwidth and data volume of write operations from the L2 cache to memory. The +metric is introduced in the book 'Intel Xeon Phi Coprocessor High-Performance +Programming' by James Jeffers and James Reinders. diff --git a/collectors/likwid/groups/phi/PAIRING.txt b/collectors/likwid/groups/phi/PAIRING.txt new file mode 100644 index 0000000..ce3627c --- /dev/null +++ b/collectors/likwid/groups/phi/PAIRING.txt @@ -0,0 +1,21 @@ +SHORT Pairing ratio + +EVENTSET +PMC0 INSTRUCTIONS_EXECUTED +PMC1 INSTRUCTIONS_EXECUTED_V_PIPE + +METRICS +Runtime (RDTSC) [s] time +V-pipe ratio PMC1/PMC0 +Pairing ratio PMC1/(PMC0-PMC1) + +LONG +Formulas: +V-pipe ratio = INSTRUCTIONS_EXECUTED_V_PIPE/INSTRUCTIONS_EXECUTED +Pairing ratio = INSTRUCTIONS_EXECUTED_V_PIPE/(INSTRUCTIONS_EXECUTED-INSTRUCTIONS_EXECUTED_V_PIPE) +- +Each hardware thread on the Xeon Phi can execute two instructions simultaneously, +one in the U-pipe and one in the V-pipe. But this is only possible if the +instructions can be paired. The instructions executed in paired fashion are counted +by the event INSTRUCTIONS_EXECUTED_V_PIPE. The event INSTRUCTIONS_EXECUTED increments +for each instruction, hence the maximal increase per cycle can be 2. diff --git a/collectors/likwid/groups/phi/READ_MISS_RATIO.txt b/collectors/likwid/groups/phi/READ_MISS_RATIO.txt new file mode 100644 index 0000000..dbdaad5 --- /dev/null +++ b/collectors/likwid/groups/phi/READ_MISS_RATIO.txt @@ -0,0 +1,15 @@ +SHORT Miss ratio for data reads + +EVENTSET +PMC0 DATA_READ +PMC1 DATA_READ_MISS + +METRICS +Runtime (RDTSC) [s] time +Read miss ratio PMC1/PMC0 + +LONG +Formulas: +Read miss ratio = DATA_READ_MISS/DATA_READ +-- +Miss ratio for data reads.
diff --git a/collectors/likwid/groups/phi/TLB.txt b/collectors/likwid/groups/phi/TLB.txt new file mode 100644 index 0000000..6f00359 --- /dev/null +++ b/collectors/likwid/groups/phi/TLB.txt @@ -0,0 +1,23 @@ +SHORT TLB Misses + +EVENTSET +PMC0 LONG_DATA_PAGE_WALK +PMC1 DATA_PAGE_WALK + +METRICS +Runtime (RDTSC) [s] time +L1 TLB misses [misses/s] PMC1/time +L2 TLB misses [misses/s] PMC0/time +L1 TLB misses per L2 TLB miss PMC1/PMC0 + +LONG +Formulas: +L1 TLB misses [misses/s] = DATA_PAGE_WALK/time +L2 TLB misses [misses/s] = LONG_DATA_PAGE_WALK/time +L1 TLB misses per L2 TLB miss = DATA_PAGE_WALK/LONG_DATA_PAGE_WALK +- +Analysis of the layered TLB of the Intel Xeon Phi. According to the book +'Intel Xeon Phi Coprocessor High-Performance Programming' by James Jeffers and +James Reinders, a high L1 TLB misses per L2 TLB miss ratio suggests that your +working set fits into the L2 TLB but not into the L1 TLB. Using large pages may be +beneficial. diff --git a/collectors/likwid/groups/phi/TLB_L1.txt b/collectors/likwid/groups/phi/TLB_L1.txt new file mode 100644 index 0000000..d826d04 --- /dev/null +++ b/collectors/likwid/groups/phi/TLB_L1.txt @@ -0,0 +1,23 @@ +SHORT L1 TLB misses + +EVENTSET +PMC0 DATA_PAGE_WALK +PMC1 DATA_READ_OR_WRITE + +METRICS +Runtime (RDTSC) [s] time +L1 TLB misses [misses/s] PMC0/time +L1 TLB miss ratio PMC0/PMC1 + +LONG +Formulas: +L1 TLB misses [misses/s] = DATA_PAGE_WALK/time +L1 TLB miss ratio = DATA_PAGE_WALK/DATA_READ_OR_WRITE +- +This performance group measures the L1 TLB misses. An L1 TLB miss that hits the +L2 TLB has a penalty of about 25 cycles for 4kB pages. For 2MB pages, the penalty +for an L1 TLB miss that hits the L2 TLB is about 8 cycles. The minimal L1 TLB miss ratio +is about 1/64, so a high ratio indicates bad spatial locality: the data of a page +is only partly accessed. It can also indicate thrashing, because when multiple pages +are accessed in a loop iteration, the size and associativity are not sufficient to +hold all pages. diff --git a/collectors/likwid/groups/phi/TLB_L2.txt b/collectors/likwid/groups/phi/TLB_L2.txt new file mode 100644 index 0000000..9a95125 --- /dev/null +++ b/collectors/likwid/groups/phi/TLB_L2.txt @@ -0,0 +1,21 @@ +SHORT L2 TLB misses + +EVENTSET +PMC0 LONG_DATA_PAGE_WALK +PMC1 DATA_READ_OR_WRITE + +METRICS +Runtime (RDTSC) [s] time +L2 TLB misses [misses/s] PMC0/time +L2 TLB miss ratio PMC0/PMC1 + +LONG +Formulas: +L2 TLB misses [misses/s] = LONG_DATA_PAGE_WALK/time +L2 TLB miss ratio = LONG_DATA_PAGE_WALK/DATA_READ_OR_WRITE +- +This performance group measures the L2 TLB misses. An L2 TLB miss has a penalty +of at least 100 cycles, hence it is important to avoid them. A high ratio can +indicate thrashing, because when multiple pages are accessed in a loop iteration, +the size and associativity are not sufficient to hold all pages. This would also +result in a bad ratio for the L1 TLB.
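To see why these ratios matter, one can roughly estimate the cycles lost to TLB misses from the penalties quoted above (about 25 cycles for an L1 TLB miss that hits the L2 TLB with 4kB pages, at least 100 cycles for an L2 TLB miss). A back-of-the-envelope Go sketch; the counter values are assumptions, not measurements:

package main

import "fmt"

func main() {
	l1Misses := 2.0e7 // assumed DATA_PAGE_WALK count (L1 TLB misses)
	l2Misses := 1.0e6 // assumed LONG_DATA_PAGE_WALK count (L2 TLB misses)
	// L1 misses that still hit the L2 TLB cost roughly 25 cycles;
	// misses that also miss the L2 TLB cost at least 100 cycles.
	lost := (l1Misses-l2Misses)*25.0 + l2Misses*100.0
	fmt.Printf("estimated cycles lost to TLB misses: %.3g\n", lost)
}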
diff --git a/collectors/likwid/groups/phi/VECTOR.txt b/collectors/likwid/groups/phi/VECTOR.txt new file mode 100644 index 0000000..b6ec6a6 --- /dev/null +++ b/collectors/likwid/groups/phi/VECTOR.txt @@ -0,0 +1,21 @@ +SHORT Vectorization intensity + +EVENTSET +PMC0 VPU_INSTRUCTIONS_EXECUTED +PMC1 VPU_ELEMENTS_ACTIVE + +METRICS +Runtime (RDTSC) [s] time +Vectorization intensity PMC1/PMC0 + +LONG +Formulas: +Vectorization intensity = VPU_ELEMENTS_ACTIVE / VPU_INSTRUCTIONS_EXECUTED +- +Vector instructions include instructions that perform floating-point +operations, instructions that load vector registers from memory and store them +to memory, instructions to manipulate vector mask registers, and other special +purpose instructions such as vector shuffle. +According to the book 'Intel Xeon Phi Coprocessor High-Performance Programming' +by James Jeffers and James Reinders, the vectorization intensity should be >=8 +for double precision and >=16 for single precision. diff --git a/collectors/likwid/groups/phi/VECTOR2.txt b/collectors/likwid/groups/phi/VECTOR2.txt new file mode 100644 index 0000000..52b3c59 --- /dev/null +++ b/collectors/likwid/groups/phi/VECTOR2.txt @@ -0,0 +1,20 @@ +SHORT Vector unit usage + +EVENTSET +PMC0 VPU_INSTRUCTIONS_EXECUTED +PMC1 VPU_STALL_REG + +METRICS +Runtime (RDTSC) [s] time +VPU stall ratio [%] 100*(PMC1/PMC0) + +LONG +Formulas: +VPU stall ratio [%] = 100*(VPU_STALL_REG/VPU_INSTRUCTIONS_EXECUTED) +-- +This group measures how efficiently the processor works with +regard to vector instruction throughput. The event VPU_STALL_REG counts +the VPU stalls due to data dependencies. Dependencies are read-after-write, +write-after-write and write-after-read. + diff --git a/collectors/likwid/groups/phi/VPU_FILL_RATIO_DBL.txt b/collectors/likwid/groups/phi/VPU_FILL_RATIO_DBL.txt new file mode 100644 index 0000000..6e8065c --- /dev/null +++ b/collectors/likwid/groups/phi/VPU_FILL_RATIO_DBL.txt @@ -0,0 +1,18 @@ +SHORT VPU filling for double precision data + +EVENTSET +PMC0 VPU_INSTRUCTIONS_EXECUTED +PMC1 VPU_ELEMENTS_ACTIVE + +METRICS +Runtime (RDTSC) [s] time +VPU fill ratio PMC0*8/PMC1 + +LONG +Formulas: +VPU fill ratio = VPU_INSTRUCTIONS_EXECUTED*8/VPU_ELEMENTS_ACTIVE +-- +This performance group measures the number of vector instructions that are +performed on each vector loaded to the VPU. It is important to increase the +ratio to get a high throughput because memory accesses (loading data to the VPU) +are expensive. diff --git a/collectors/likwid/groups/phi/VPU_PAIRING.txt b/collectors/likwid/groups/phi/VPU_PAIRING.txt new file mode 100644 index 0000000..024919b --- /dev/null +++ b/collectors/likwid/groups/phi/VPU_PAIRING.txt @@ -0,0 +1,20 @@ +SHORT VPU pairing ratio + +EVENTSET +PMC0 VPU_INSTRUCTIONS_EXECUTED +PMC1 VPU_INSTRUCTIONS_EXECUTED_V_PIPE + +METRICS +Runtime (RDTSC) [s] time +V-pipe ratio PMC1/PMC0 +Pairing ratio PMC1/(PMC0-PMC1) + +LONG +Formulas: +V-pipe ratio = VPU_INSTRUCTIONS_EXECUTED_V_PIPE/VPU_INSTRUCTIONS_EXECUTED +Pairing ratio = VPU_INSTRUCTIONS_EXECUTED_V_PIPE/(VPU_INSTRUCTIONS_EXECUTED-VPU_INSTRUCTIONS_EXECUTED_V_PIPE) +-- +This performance group measures the pairing ratio of vector instructions. The +V-pipe can only execute a subset of all instructions, the main workload is done +by the U-pipe. A higher throughput can be achieved if the pairing ratio is +increased.
diff --git a/collectors/likwid/groups/phi/VPU_READ_MISS_RATIO.txt b/collectors/likwid/groups/phi/VPU_READ_MISS_RATIO.txt new file mode 100644 index 0000000..cf04c5f --- /dev/null +++ b/collectors/likwid/groups/phi/VPU_READ_MISS_RATIO.txt @@ -0,0 +1,16 @@ +SHORT Miss ratio for VPU data reads + +EVENTSET +PMC0 VPU_DATA_READ +PMC1 VPU_DATA_READ_MISS + +METRICS +Runtime (RDTSC) [s] time +VPU read miss ratio PMC1/PMC0 + +LONG +Formulas: +VPU read miss ratio = VPU_DATA_READ_MISS/VPU_DATA_READ +-- +This performance group determines the ratio between reads and reads that miss +the cache and are issued by the VPU. diff --git a/collectors/likwid/groups/phi/VPU_WRITE_MISS_RATIO.txt b/collectors/likwid/groups/phi/VPU_WRITE_MISS_RATIO.txt new file mode 100644 index 0000000..cebf3c7 --- /dev/null +++ b/collectors/likwid/groups/phi/VPU_WRITE_MISS_RATIO.txt @@ -0,0 +1,16 @@ +SHORT Miss ratio for VPU data writes + +EVENTSET +PMC0 VPU_DATA_WRITE +PMC1 VPU_DATA_WRITE_MISS + +METRICS +Runtime (RDTSC) [s] time +VPU write miss ratio PMC1/PMC0 + +LONG +Formulas: +VPU write miss ratio = VPU_DATA_WRITE_MISS/VPU_DATA_WRITE +-- +This performance group determines the ratio between writes and writes that miss +the cache and are issued by the VPU. diff --git a/collectors/likwid/groups/phi/WRITE_MISS_RATIO.txt b/collectors/likwid/groups/phi/WRITE_MISS_RATIO.txt new file mode 100644 index 0000000..1e92c76 --- /dev/null +++ b/collectors/likwid/groups/phi/WRITE_MISS_RATIO.txt @@ -0,0 +1,15 @@ +SHORT Miss ratio for data writes + +EVENTSET +PMC0 DATA_WRITE +PMC1 DATA_WRITE_MISS + +METRICS +Runtime (RDTSC) [s] time +Write miss ratio PMC1/PMC0 + +LONG +Formulas: +Write miss ratio = DATA_WRITE_MISS/DATA_WRITE +-- +Miss ratio for data writes. diff --git a/collectors/likwid/groups/power8/BRANCH.txt b/collectors/likwid/groups/power8/BRANCH.txt new file mode 100644 index 0000000..870bb9d --- /dev/null +++ b/collectors/likwid/groups/power8/BRANCH.txt @@ -0,0 +1,30 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +PMC0 PM_BR_PRED_BR_CMPL +PMC1 PM_BR_PRED_CCACHE_CMPL +PMC2 PM_BR_PRED_CR_CMPL +PMC3 PM_BR_MPRED_CMPL +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +Branch rate (PMC0+PMC1+PMC2)/PMC4 +Branch misprediction rate PMC3/PMC4 +Branch misprediction ratio PMC3/(PMC0+PMC1+PMC2) +Instructions per branch PMC4/(PMC0+PMC1+PMC2) + +LONG +Formulas: +Branch rate = (PM_BR_PRED_BR_CMPL+PM_BR_PRED_CCACHE_CMPL+PM_BR_PRED_CR_CMPL)/PM_RUN_INST_CMPL +Branch misprediction rate = PM_BR_MPRED_CMPL/PM_RUN_INST_CMPL +Branch misprediction ratio = PM_BR_MPRED_CMPL/(PM_BR_PRED_BR_CMPL+PM_BR_PRED_CCACHE_CMPL+PM_BR_PRED_CR_CMPL) +Instructions per branch = PM_RUN_INST_CMPL/(PM_BR_PRED_BR_CMPL+PM_BR_PRED_CCACHE_CMPL+PM_BR_PRED_CR_CMPL) +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio directly relates +the mispredicted branches to all branch instructions. +Instructions per branch is 1/Branch rate.
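For the branch groups, the derived metrics are plain ratios of the raw counters. A small Go sketch with assumed POWER8 counter values shows how the metrics relate to each other; the numbers are illustrative only:

package main

import "fmt"

func main() {
	// Assumed counts for the three branch-prediction events, the
	// misprediction event and the instruction counter
	// (PM_BR_PRED_*_CMPL, PM_BR_MPRED_CMPL, PM_RUN_INST_CMPL).
	brCmpl, ccache, cr := 3.0e8, 2.0e7, 1.0e7
	mispred := 1.5e7
	instr := 4.0e9

	branches := brCmpl + ccache + cr
	fmt.Printf("branch rate:             %.4f\n", branches/instr)
	fmt.Printf("misprediction rate:      %.4f\n", mispred/instr)
	fmt.Printf("misprediction ratio:     %.4f\n", mispred/branches)
	fmt.Printf("instructions per branch: %.1f\n", instr/branches)
}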
+ diff --git a/collectors/likwid/groups/power8/CPISTACK1.txt b/collectors/likwid/groups/power8/CPISTACK1.txt new file mode 100644 index 0000000..aa8a643 --- /dev/null +++ b/collectors/likwid/groups/power8/CPISTACK1.txt @@ -0,0 +1,35 @@ +SHORT First level of IBM CPI stack + +EVENTSET +PMC0 PM_CMPLU_STALL_THRD +PMC1 PM_GCT_EMPTY_CYC +PMC3 PM_CMPLU_STALL +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +CPI PMC5/PMC4 +Stall cycles PMC3 +Stall cycle ratio PMC3/PMC5 +Thread blocked cycles PMC0 +Thread blocked cycle ratio PMC0/PMC5 +GCT empty cycles PMC1 +GCT empty cycle ratio PMC1/PMC5 + +LONG +Formulas: +Stall cycles = PM_CMPLU_STALL +Stall cycle ratio = PM_CMPLU_STALL/PM_RUN_CYC +Thread blocked cycles = PM_CMPLU_STALL_THRD +Thread blocked cycle ratio = PM_CMPLU_STALL_THRD/PM_RUN_CYC +GCT empty cycles = PM_GCT_EMPTY_CYC +GCT empty cycle ratio = PM_GCT_EMPTY_CYC/PM_RUN_CYC +-- +First level of IBM CPI stack. IBM names them Stalled Cycles, Waiting to Complete, +Thread Blocked, Completion Table Empty, Other and Completion Cycles. For some +there are no clearly identifiable events, so this group concentrates on +Stalled Cycles (PM_CMPLU_STALL), Thread Blocked (PM_CMPLU_STALL_THRD), +Completion Table Empty (PM_GCT_EMPTY_CYC) and Other (PM_CMPLU_STALL_OTHER_CMPL). diff --git a/collectors/likwid/groups/power8/DATA.txt b/collectors/likwid/groups/power8/DATA.txt new file mode 100644 index 0000000..bc3b893 --- /dev/null +++ b/collectors/likwid/groups/power8/DATA.txt @@ -0,0 +1,23 @@ +SHORT Load to store ratio + +EVENTSET +PMC0 PM_LD_CMPL +PMC1 PM_ST_CMPL +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +Load to store ratio PMC0/PMC1 +Load ratio PMC0/PMC4 +Store ratio PMC1/PMC4 + +LONG +Formulas: +Load to store ratio = PM_LD_CMPL/PM_ST_CMPL +Load ratio = PM_LD_CMPL/PM_RUN_INST_CMPL +Store ratio = PM_ST_CMPL/PM_RUN_INST_CMPL +- +This is a metric to determine your load to store ratio.
+ diff --git a/collectors/likwid/groups/power8/FLOPS_1_2.txt b/collectors/likwid/groups/power8/FLOPS_1_2.txt new file mode 100644 index 0000000..27138d5 --- /dev/null +++ b/collectors/likwid/groups/power8/FLOPS_1_2.txt @@ -0,0 +1,24 @@ +SHORT Group 121 as used in IBM Parallel Environment Developer Edition + +EVENTSET +PMC0 PM_VSU0_1FLOP +PMC1 PM_VSU1_1FLOP +PMC2 PM_VSU0_2FLOP +PMC3 PM_VSU1_2FLOP +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +CPI PMC5/PMC4 +One FLOP ops PMC0+PMC1 +Two FLOPs ops PMC2+PMC3 +[MFLOP/s] 1E-6*(PMC0+PMC1+((PMC2+PMC3)*2))/time + +LONG +Formulas: +CPI = PM_RUN_CYC/PM_RUN_INST_CMPL +One FLOP ops = PM_VSU0_1FLOP+PM_VSU1_1FLOP +Two FLOPs ops = PM_VSU0_2FLOP+PM_VSU1_2FLOP +[MFLOP/s] = 1E-6*(PM_VSU0_1FLOP+PM_VSU1_1FLOP+((PM_VSU0_2FLOP+PM_VSU1_2FLOP)*2))/time +-- +Group 121 from web page http://www.ibm.com/support/knowledgecenter/en/SSFK5S_2.2.0/com.ibm.cluster.pedev.v2r2.pedev100.doc/bl7ug_power8metrics.htm diff --git a/collectors/likwid/groups/power8/FLOPS_4_8.txt b/collectors/likwid/groups/power8/FLOPS_4_8.txt new file mode 100644 index 0000000..70e600a --- /dev/null +++ b/collectors/likwid/groups/power8/FLOPS_4_8.txt @@ -0,0 +1,24 @@ +SHORT Group 122 as used in IBM Parallel Environment Developer Edition + +EVENTSET +PMC0 PM_VSU0_4FLOP +PMC1 PM_VSU1_4FLOP +PMC2 PM_VSU0_8FLOP +PMC3 PM_VSU1_8FLOP +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +CPI PMC5/PMC4 +Four FLOPs ops PMC0+PMC1 +Eight FLOPs ops PMC2+PMC3 +MFLOP/s 1E-6*(((PMC0+PMC1)*4.0)+((PMC2+PMC3)*8.0))/time + +LONG +Formulas: +CPI = PM_RUN_CYC/PM_RUN_INST_CMPL +Four FLOPs ops = PM_VSU0_4FLOP+PM_VSU1_4FLOP +Eight FLOPs ops = PM_VSU0_8FLOP+PM_VSU1_8FLOP +MFLOP/s = 1E-6*(((PM_VSU0_4FLOP+PM_VSU1_4FLOP)*4.0)+((PM_VSU0_8FLOP+PM_VSU1_8FLOP)*8.0))/time +-- +Group 122 from web page http://www.ibm.com/support/knowledgecenter/en/SSFK5S_2.2.0/com.ibm.cluster.pedev.v2r2.pedev100.doc/bl7ug_power8metrics.htm diff --git a/collectors/likwid/groups/power8/FLOPS_DP.txt b/collectors/likwid/groups/power8/FLOPS_DP.txt new file mode 100644 index 0000000..8c3bfdf --- /dev/null +++ b/collectors/likwid/groups/power8/FLOPS_DP.txt @@ -0,0 +1,27 @@ +SHORT Double Precision MFlops/s + +EVENTSET +PMC0 PM_VSU0_DP_2FLOP +PMC1 PM_VSU0_DP_FMA +PMC2 PM_VSU0_DP_FSQRT_FDIV +PMC3 PM_VSU0_SCALAR_DP_ISSUED +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +DP [MFLOP/s] 1.0E-06*((PMC0*2.0)+PMC2+(PMC1*4.0))/time +DP VSX [MFLOP/s] 1.0E-06*((PMC1*4.0)+(PMC0*2.0))/time +Packed [MUOPS/s] 1.0E-06*(PMC1)/time +Scalar [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time + +LONG +Formulas: +CPI = PM_RUN_CYC/PM_RUN_INST_CMPL +DP [MFLOP/s] = 1.0E-06*((PM_VSU0_DP_2FLOP*2.0)+PM_VSU0_DP_FSQRT_FDIV+(PM_VSU0_DP_FMA*4.0))/time +DP VSX [MFLOP/s] = 1.0E-06*((PM_VSU0_DP_FMA*4.0)+(PM_VSU0_DP_2FLOP*2.0))/time +Packed [MUOPS/s] = 1.0E-06*PM_VSU0_DP_FMA/time +Scalar [MUOPS/s] = 1.0E-06*(PM_VSU0_DP_2FLOP+PM_VSU0_DP_FSQRT_FDIV)/time +-- + diff --git a/collectors/likwid/groups/power8/FLOPS_DP2.txt b/collectors/likwid/groups/power8/FLOPS_DP2.txt new file mode 100644 index 0000000..69ca9e2 --- /dev/null +++ b/collectors/likwid/groups/power8/FLOPS_DP2.txt @@ -0,0 +1,27 @@ +SHORT Double Precision MFlops/s + +EVENTSET +PMC0 PM_VSU1_DP_2FLOP +PMC1 PM_VSU1_DP_FMA +PMC2 PM_VSU1_DP_FSQRT_FDIV +PMC3 PM_VSU1_SCALAR_DP_ISSUED +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +DP [MFLOP/s]
1.0E-06*((PMC0*2.0)+PMC2+(PMC1*4.0))/time +DP VSX [MFLOP/s] 1.0E-06*((PMC1)*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC1)/time +Scalar [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time + +LONG +Formulas: +CPI = PM_RUN_CYC/PM_RUN_INST_CMPL +DP [MFLOP/s] = 1.0E-06*((PM_VSU1_DP_2FLOP*2.0)+PM_VSU1_DP_FSQRT_FDIV+(PM_VSU1_DP_FMA*4.0))/time +DP VSX [MFLOP/s] = 1.0E-06*(PM_VSU1_DP_FMA*4.0)/time +Packed [MUOPS/s] = 1.0E-06*PM_VSU1_DP_FMA/time +Scalar [MUOPS/s] = 1.0E-06*(PM_VSU1_DP_2FLOP+PM_VSU1_DP_FSQRT_FDIV)/time +-- + diff --git a/collectors/likwid/groups/power8/FLOPS_FMA.txt b/collectors/likwid/groups/power8/FLOPS_FMA.txt new file mode 100644 index 0000000..8bf5234 --- /dev/null +++ b/collectors/likwid/groups/power8/FLOPS_FMA.txt @@ -0,0 +1,28 @@ +SHORT Group 124 as used in IBM Parallel Environment Developer Edition + +EVENTSET +PMC0 PM_VSU0_DP_FMA +PMC1 PM_VSU1_DP_FMA +PMC2 PM_VSU0_FMA +PMC3 PM_VSU1_FMA +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +CPI PMC5/PMC4 +DP FMAs PMC0+PMC1 +Scalar FMAs PMC2+PMC3 +DP FMA [MFLOP/s] 1E-6*(PMC0+PMC1)*4.0/time +Scalar FMA [MFLOP/s] 1E-6*(PMC2+PMC3)*2.0/time +[MFLOP/s] 1E-6*(((PMC0+PMC1)*4.0)+((PMC2+PMC3)*2.0))/time + +LONG +Formulas: +CPI = PM_RUN_CYC/PM_RUN_INST_CMPL +DP FMAs = PM_VSU0_DP_FMA+PM_VSU1_DP_FMA +Scalar FMAs = PM_VSU0_FMA+PM_VSU1_FMA +DP FMA [MFLOP/s] = 1E-6*(PM_VSU0_DP_FMA+PM_VSU1_DP_FMA)*4.0/runtime +Scalar FMA [MFLOP/s] = 1E-6*(PM_VSU0_FMA+PM_VSU1_FMA)*2.0/runtime +[MFLOP/s] = 1E-6*(((PM_VSU0_DP_FMA+PM_VSU1_DP_FMA)*4.0)+((PM_VSU0_FMA+PM_VSU1_FMA)*2.0))/runtime +-- +Group 124 from web page http://www.ibm.com/support/knowledgecenter/en/SSFK5S_2.2.0/com.ibm.cluster.pedev.v2r2.pedev100.doc/bl7ug_power8metrics.htm diff --git a/collectors/likwid/groups/power8/FLOPS_SP.txt b/collectors/likwid/groups/power8/FLOPS_SP.txt new file mode 100644 index 0000000..19bcd5c --- /dev/null +++ b/collectors/likwid/groups/power8/FLOPS_SP.txt @@ -0,0 +1,27 @@ +SHORT Single Precision MFlops/s + +EVENTSET +PMC0 PM_VSU0_SINGLE +PMC1 PM_VSU0_VECTOR_SP_ISSUED +PMC2 PM_VSU1_SINGLE +PMC3 PM_VSU1_VECTOR_SP_ISSUED +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +SP [MFLOP/s] 1.0E-06*(((PMC0-PMC1)+(PMC2-PMC3))*4.0+(PMC1+PMC3)*8.0)/time +SP VSX [MFLOP/s] 1.0E-06*((PMC1+PMC3)*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC1+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time + +LONG +Formulas: +CPI = PM_RUN_CYC/PM_RUN_INST_CMPL +SP [MFLOP/s] = 1.0E-06*(((PM_VSU0_SINGLE-PM_VSU0_VECTOR_SP_ISSUED)+(PM_VSU1_SINGLE-PM_VSU1_VECTOR_SP_ISSUED))*4.0+(PM_VSU0_VECTOR_SP_ISSUED+PM_VSU1_VECTOR_SP_ISSUED)*8.0)/time +SP VSX [MFLOP/s] = 1.0E-06*((PM_VSU0_VECTOR_SP_ISSUED+PM_VSU1_VECTOR_SP_ISSUED)*8.0)/time +Packed [MUOPS/s] = 1.0E-06*(PM_VSU0_VECTOR_SP_ISSUED+PM_VSU1_VECTOR_SP_ISSUED)/time +Scalar [MUOPS/s] = 1.0E-06*(PM_VSU0_SINGLE+PM_VSU1_SINGLE)/time +-- + diff --git a/collectors/likwid/groups/power8/FLOPS_VSU0.txt b/collectors/likwid/groups/power8/FLOPS_VSU0.txt new file mode 100644 index 0000000..fa94626 --- /dev/null +++ b/collectors/likwid/groups/power8/FLOPS_VSU0.txt @@ -0,0 +1,23 @@ +SHORT Double Precision MFlops/s performed by VSU pipe 0 + +EVENTSET +PMC0 PM_VSU0_DP_2FLOP +PMC1 PM_VSU0_DP_FMA +PMC2 PM_VSU0_DP_FSQRT_FDIV +PMC3 PM_VSU0_1FLOP +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +[MFLOP/s] 1.0E-06*((PMC0*2.0)+(PMC2*8.0)+(PMC1*4.0)+PMC3)/time +VSX [MFLOP/s] 1.0E-06*(PMC1*4.0)/time + +LONG +Formulas: +CPI =
PM_RUN_CYC/PM_RUN_INST_CMPL +[MFLOP/s] = 1.0E-06*((PM_VSU0_DP_2FLOP*2.0)+(PM_VSU0_DP_FSQRT_FDIV*8.0)+(PM_VSU0_DP_FMA*4.0)+PM_VSU0_1FLOP)/time +VSX [MFLOP/s] = 1.0E-06*(PM_VSU0_DP_FMA*4.0)/time +-- + diff --git a/collectors/likwid/groups/power8/FLOPS_VSU1.txt b/collectors/likwid/groups/power8/FLOPS_VSU1.txt new file mode 100644 index 0000000..617ab88 --- /dev/null +++ b/collectors/likwid/groups/power8/FLOPS_VSU1.txt @@ -0,0 +1,22 @@ +SHORT Double Precision MFlops/s performed by VSU pipe 1 + +EVENTSET +PMC0 PM_VSU1_DP_2FLOP +PMC1 PM_VSU1_DP_FMA +PMC2 PM_VSU1_DP_FSQRT_FDIV +PMC3 PM_VSU1_1FLOP +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +[MFLOP/s] 1.0E-06*((PMC0*2.0)+(PMC2*8.0)+(PMC1*4.0)+PMC3)/time +VSX [MFLOP/s] 1.0E-06*(PMC1*4.0)/time + +LONG +Formulas: +CPI = PM_RUN_CYC/PM_RUN_INST_CMPL +[MFLOP/s] = 1.0E-06*((PM_VSU1_DP_2FLOP*2.0)+(PM_VSU1_DP_FSQRT_FDIV*8.0)+(PM_VSU1_DP_FMA*4.0)+PM_VSU1_1FLOP)/time +VSX [MFLOP/s] = 1.0E-06*(PM_VSU1_DP_FMA*4.0)/time +-- diff --git a/collectors/likwid/groups/power8/FLOPS_VSX.txt b/collectors/likwid/groups/power8/FLOPS_VSX.txt new file mode 100644 index 0000000..063ad0c --- /dev/null +++ b/collectors/likwid/groups/power8/FLOPS_VSX.txt @@ -0,0 +1,29 @@ +SHORT Vectorized MFlops/s + +EVENTSET +PMC0 PM_VSU0_VECTOR_DP_ISSUED +PMC1 PM_VSU1_VECTOR_DP_ISSUED +PMC2 PM_VSU0_VECTOR_SP_ISSUED +PMC3 PM_VSU1_VECTOR_SP_ISSUED +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +[MFLOP/s] 1.0E-06*((PMC0+PMC1)*4.0+(PMC2+PMC3)*8.0)/time +DP [MFLOP/s] 1.0E-06*((PMC0+PMC1)*4.0)/time +SP [MFLOP/s] 1.0E-06*((PMC2+PMC3)*8.0)/time +DP [MUOPS/s] 1.0E-06*(PMC0+PMC1)/time +SP [MUOPS/s] 1.0E-06*(PMC2+PMC3)/time + +LONG +Formulas: +CPI = PM_RUN_CYC/PM_RUN_INST_CMPL +[MFLOP/s] = 1.0E-06*((PM_VSU0_VECTOR_DP_ISSUED+PM_VSU1_VECTOR_DP_ISSUED)*4.0+(PM_VSU0_VECTOR_SP_ISSUED+PM_VSU1_VECTOR_SP_ISSUED)*8.0)/runtime +DP [MFLOP/s] = 1.0E-06*((PM_VSU0_VECTOR_DP_ISSUED+PM_VSU1_VECTOR_DP_ISSUED)*4.0)/runtime +SP [MFLOP/s] = 1.0E-06*((PM_VSU0_VECTOR_SP_ISSUED+PM_VSU1_VECTOR_SP_ISSUED)*8.0)/runtime +DP [MUOPS/s] = 1.0E-06*(PM_VSU0_VECTOR_DP_ISSUED+PM_VSU1_VECTOR_DP_ISSUED)/runtime +SP [MUOPS/s] = 1.0E-06*(PM_VSU0_VECTOR_SP_ISSUED+PM_VSU1_VECTOR_SP_ISSUED)/runtime +-- + diff --git a/collectors/likwid/groups/power8/ICACHE.txt b/collectors/likwid/groups/power8/ICACHE.txt new file mode 100644 index 0000000..7a07fd4 --- /dev/null +++ b/collectors/likwid/groups/power8/ICACHE.txt @@ -0,0 +1,22 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +PMC0 PM_INST_FROM_L1 +PMC1 PM_L1_ICACHE_MISS +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +L1I request rate PMC0/PMC4 +L1I miss rate PMC1/PMC4 +L1I miss ratio PMC1/PMC0 + +LONG +Formulas: +L1I request rate = PM_INST_FROM_L1 / PM_RUN_INST_CMPL +L1I miss rate = PM_L1_ICACHE_MISS / PM_RUN_INST_CMPL +L1I miss ratio = PM_L1_ICACHE_MISS / PM_INST_FROM_L1 +- +This group measures some L1 instruction cache metrics.
diff --git a/collectors/likwid/groups/power8/L1.txt b/collectors/likwid/groups/power8/L1.txt new file mode 100644 index 0000000..19dc36e --- /dev/null +++ b/collectors/likwid/groups/power8/L1.txt @@ -0,0 +1,33 @@ +SHORT L1 cache bandwidth in MBytes/s + +EVENTSET +PMC0 PM_LD_REF_L1 +PMC1 PM_ST_CMPL +PMC2 PM_LSU_L1_PREF +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +L1D load bandwidth [MBytes/s] 1.0E-06*((PMC0+PMC2)/2)*64.0/time +L1D load data volume [GBytes] 1.0E-09*((PMC0+PMC2)/2)*64.0 +L1D store bandwidth [MBytes/s] 1.0E-06*((PMC1/2))*64.0/time +L1D store data volume [GBytes] 1.0E-09*((PMC1/2))*64.0 +L1 bandwidth [MBytes/s] 1.0E-06*((PMC1+PMC0+PMC2)/2)*64.0/time +L1 data volume [GBytes] 1.0E-09*((PMC1+PMC0+PMC2)/2)*64.0 + +LONG +Formulas: +CPI = PM_RUN_CYC/PM_RUN_INST_CMPL +L1D load bandwidth [MBytes/s] = 1.0E-06*((PM_LD_REF_L1+PM_LSU_L1_PREF)/2)*64.0/time +L1D load data volume [GBytes] = 1.0E-09*((PM_LD_REF_L1+PM_LSU_L1_PREF)/2)*64.0 +L1D store bandwidth [MBytes/s] = 1.0E-06*(PM_ST_CMPL/2)*64.0/time +L1D store data volume [GBytes] = 1.0E-09*(PM_ST_CMPL/2)*64.0 +L1 bandwidth [MBytes/s] = 1.0E-06*((PM_LD_REF_L1+PM_LSU_L1_PREF+PM_ST_CMPL)/2)*64.0/time +L1 data volume [GBytes] = 1.0E-09*((PM_LD_REF_L1+PM_LSU_L1_PREF+PM_ST_CMPL)/2)*64.0 +- +Profiling group to measure L1 cache bandwidth. The bandwidth is computed from the +number of loads, hardware prefetches and completed stores, assuming 64-byte +cache lines. diff --git a/collectors/likwid/groups/power8/L2.txt b/collectors/likwid/groups/power8/L2.txt new file mode 100644 index 0000000..d5af584 --- /dev/null +++ b/collectors/likwid/groups/power8/L2.txt @@ -0,0 +1,32 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +PMC0 PM_L2_ST +PMC2 PM_LD_MISS_L1 +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +L2D load bandwidth [MBytes/s] 1.0E-06*(PMC2/2)*128.0/time +L2D load data volume [GBytes] 1.0E-09*(PMC2/2)*128.0 +L2D store bandwidth [MBytes/s] 1.0E-06*(PMC0/2)*128.0/time +L2D store data volume [GBytes] 1.0E-09*(PMC0/2)*128.0 +L2 bandwidth [MBytes/s] 1.0E-06*((PMC0+PMC2)/2)*128.0/time +L2 data volume [GBytes] 1.0E-09*((PMC0+PMC2)/2)*128.0 + +LONG +Formulas: +CPI = PM_RUN_CYC/PM_RUN_INST_CMPL +L2D load bandwidth [MBytes/s] = 1.0E-06*(PM_LD_MISS_L1/2)*128.0/time +L2D load data volume [GBytes] = 1.0E-09*(PM_LD_MISS_L1/2)*128.0 +L2D store bandwidth [MBytes/s] = 1.0E-06*(PM_L2_ST/2)*128.0/time +L2D store data volume [GBytes] = 1.0E-09*(PM_L2_ST/2)*128.0 +L2 bandwidth [MBytes/s] = 1.0E-06*((PM_LD_MISS_L1+PM_L2_ST)/2)*128.0/time +L2 data volume [GBytes] = 1.0E-09*((PM_LD_MISS_L1+PM_L2_ST)/2)*128.0 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the +number of cache lines loaded from the L2 to the L1 data cache. There is currently no +event to get the evicted data volume.
diff --git a/collectors/likwid/groups/power8/L2CACHE.txt b/collectors/likwid/groups/power8/L2CACHE.txt new file mode 100644 index 0000000..47bcedd --- /dev/null +++ b/collectors/likwid/groups/power8/L2CACHE.txt @@ -0,0 +1,40 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +PMC0 PM_L2_ST_MISS +PMC1 PM_L2_LD_MISS +PMC2 PM_L2_LD_DISP +PMC3 PM_L2_ST_DISP +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +L2 request rate (PMC2+PMC3)/PMC4 +L2 miss rate (PMC0+PMC1)/PMC4 +L2 miss ratio (PMC0+PMC1)/(PMC2+PMC3) + +LONG +Formulas: +L2 request rate = (PM_L2_LD_DISP+PM_L2_ST_DISP)/PM_RUN_INST_CMPL +L2 miss rate = (PM_L2_LD_MISS+PM_L2_ST_MISS)/PM_RUN_INST_CMPL +L2 miss ratio = (PM_L2_LD_MISS+PM_L2_ST_MISS)/(PM_L2_LD_DISP+PM_L2_ST_DISP) +L2 load request rate = PM_L2_LD_DISP/PM_RUN_INST_CMPL +L2 store request rate = PM_L2_ST_DISP/PM_RUN_INST_CMPL +L2 load miss rate = PM_L2_LD_MISS/PM_RUN_INST_CMPL +L2 store miss rate = PM_L2_ST_MISS/PM_RUN_INST_CMPL +L2 load miss ratio = PM_L2_LD_MISS/(PM_L2_LD_DISP+PM_L2_ST_DISP) +L2 store miss ratio = PM_L2_ST_MISS/(PM_L2_LD_DISP+PM_L2_ST_DISP) +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cachelines from memory. And finally the L2 miss ratio tells you how many of your +memory references required a cacheline to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/power8/L3.txt b/collectors/likwid/groups/power8/L3.txt new file mode 100644 index 0000000..0737c44 --- /dev/null +++ b/collectors/likwid/groups/power8/L3.txt @@ -0,0 +1,31 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +PMC0 PM_L3_LD_PREF +PMC3 PM_DATA_FROM_L3 +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +L3D load bandwidth [MBytes/s] 1.0E-06*(PMC3+(PMC0-PMC3))*128.0/time +L3D load data volume [GBytes] 1.0E-09*(PMC3+(PMC0-PMC3))*128.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC3+(PMC0-PMC3))*128.0/time +L3 data volume [GBytes] 1.0E-09*(PMC3+(PMC0-PMC3))*128.0 +Loads from local L3 per cycle 100.0*(PMC3+(PMC0-PMC3))/PMC5 + +LONG +Formulas: +CPI = PM_RUN_CYC/PM_RUN_INST_CMPL +L3D load bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_FROM_L3+(PM_L3_LD_PREF-PM_DATA_FROM_L3))*128.0/time +L3D load data volume [GBytes] = 1.0E-09*(PM_DATA_FROM_L3+(PM_L3_LD_PREF-PM_DATA_FROM_L3))*128.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_FROM_L3+(PM_L3_LD_PREF-PM_DATA_FROM_L3))*128.0/time +L3 data volume [GBytes] = 1.0E-09*(PM_DATA_FROM_L3+(PM_L3_LD_PREF-PM_DATA_FROM_L3))*128.0 +Loads from local L3 per cycle = 100.0*(PM_DATA_FROM_L3+(PM_L3_LD_PREF-PM_DATA_FROM_L3))/PM_RUN_CYC +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the +number of cache lines loaded from the L3 to the L2 data cache. There is currently no +event to get the evicted data volume.
diff --git a/collectors/likwid/groups/power8/MEM.txt b/collectors/likwid/groups/power8/MEM.txt new file mode 100644 index 0000000..4831c80 --- /dev/null +++ b/collectors/likwid/groups/power8/MEM.txt @@ -0,0 +1,30 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +PMC0 PM_L3_CO_MEPF +PMC1 PM_DATA_ALL_FROM_MEMORY +PMC3 PM_L3_PF_ON_CHIP_MEM +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +Memory load bandwidth [MBytes/s] 1.0E-06*(PMC1+PMC3)*128.0/time +Memory load data volume [GBytes] 1.0E-09*(PMC1+PMC3)*128.0 +Memory evict bandwidth [MBytes/s] 1.0E-06*(PMC0)*128.0/time +Memory evict data volume [GBytes] 1.0E-09*(PMC0)*128.0 +Memory bandwidth [MBytes/s] 1.0E-06*(PMC1+PMC3+PMC0)*128.0/time +Memory data volume [GBytes] 1.0E-09*(PMC1+PMC3+PMC0)*128.0 + +LONG +Formulas: +CPI = PM_RUN_CYC / PM_RUN_INST_CMPL +Memory load bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_ALL_FROM_MEMORY+PM_L3_PF_ON_CHIP_MEM)*128/time +Memory load data volume [GBytes] = 1.0E-09*(PM_DATA_ALL_FROM_MEMORY+PM_L3_PF_ON_CHIP_MEM)*128 +Memory evict bandwidth [MBytes/s] = 1.0E-06*(PM_L3_CO_MEPF)*128/time +Memory evict data volume [GBytes] = 1.0E-09*(PM_L3_CO_MEPF)*128 +Memory bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_ALL_FROM_MEMORY+PM_L3_PF_ON_CHIP_MEM+PM_L3_CO_MEPF)*128/time +Memory data volume [GBytes] = 1.0E-09*(PM_DATA_ALL_FROM_MEMORY+PM_L3_PF_ON_CHIP_MEM+PM_L3_CO_MEPF)*128 +-- +This group uses the core-local events to measure data traffic from memory. diff --git a/collectors/likwid/groups/power8/NUMA.txt b/collectors/likwid/groups/power8/NUMA.txt new file mode 100644 index 0000000..e2a6e71 --- /dev/null +++ b/collectors/likwid/groups/power8/NUMA.txt @@ -0,0 +1,29 @@ +SHORT Memory bandwidth in MBytes/s for local and remote memory + +EVENTSET +PMC1 PM_DATA_ALL_FROM_LMEM +PMC3 PM_DATA_ALL_FROM_DMEM +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +Local bandwidth [MBytes/s] 1.0E-06*(PMC1)*128.0/time +Local data volume [GBytes] 1.0E-09*(PMC1)*128.0 +Remote bandwidth [MBytes/s] 1.0E-06*(PMC3)*128.0/time +Remote data volume [GBytes] 1.0E-09*(PMC3)*128.0 +Memory load bandwidth [MBytes/s] 1.0E-06*(PMC1+PMC3)*128.0/time +Memory load data volume [GBytes] 1.0E-09*(PMC1+PMC3)*128.0 + +LONG +Formulas: +CPI = PM_RUN_CYC / PM_RUN_INST_CMPL +Local bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_ALL_FROM_LMEM)*128/time +Local data volume [GBytes] = 1.0E-09*(PM_DATA_ALL_FROM_LMEM)*128 +Remote bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_ALL_FROM_DMEM)*128/time +Remote data volume [GBytes] = 1.0E-09*(PM_DATA_ALL_FROM_DMEM)*128 +Memory load bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_ALL_FROM_LMEM+PM_DATA_ALL_FROM_DMEM)*128/time +Memory load data volume [GBytes] = 1.0E-09*(PM_DATA_ALL_FROM_LMEM+PM_DATA_ALL_FROM_DMEM)*128 +-- +This group measures the NUMA traffic by separating local from remote memory data transfers.
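Note that the POWER8 groups scale cache-line events by 128 bytes (the POWER8 line size) rather than 64. A minimal Go sketch, with assumed counter values, of how the NUMA group's bandwidths and the remote-access share would be computed:

package main

import "fmt"

func main() {
	const lineBytes = 128.0 // POWER8 cache-line size used by these groups
	local := 8.0e8          // assumed PM_DATA_ALL_FROM_LMEM count
	remote := 2.0e8         // assumed PM_DATA_ALL_FROM_DMEM count
	seconds := 1.0          // assumed measurement interval

	fmt.Printf("local bandwidth:  %.1f MBytes/s\n", 1e-6*local*lineBytes/seconds)
	fmt.Printf("remote bandwidth: %.1f MBytes/s\n", 1e-6*remote*lineBytes/seconds)
	fmt.Printf("remote share:     %.1f%%\n", 100.0*remote/(local+remote))
}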
diff --git a/collectors/likwid/groups/power8/STALLS1.txt b/collectors/likwid/groups/power8/STALLS1.txt new file mode 100644 index 0000000..6acf949 --- /dev/null +++ b/collectors/likwid/groups/power8/STALLS1.txt @@ -0,0 +1,33 @@ +SHORT Completion stalls (group 1) + +EVENTSET +PMC0 PM_CMPLU_STALL_THRD +PMC1 PM_CMPLU_STALL_DCACHE_MISS +PMC2 PM_CMPLU_STALL_COQ_FULL +PMC3 PM_CMPLU_STALL +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +Completion stall cycles PMC3 +Stall cycles by thread conflict PMC0 +Stall ratio by thread conflict [%] PMC0/PMC3*100.0 +Stall cycles by d-cache miss PMC1 +Stall ratio by d-cache miss [%] PMC1/PMC3*100.0 +Stall cycles by full castout queue PMC2 +Stall ratio by full castout queue [%] PMC2/PMC3*100.0 + +LONG +Formulas: +CPI = PM_RUN_CYC / PM_RUN_INST_CMPL +Completion stall cycles = PM_CMPLU_STALL +Stall cycles by thread conflict = PM_CMPLU_STALL_THRD +Stall ratio by thread conflict [%] = PM_CMPLU_STALL_THRD/PM_CMPLU_STALL*100 +Stall cycles by d-cache miss = PM_CMPLU_STALL_DCACHE_MISS +Stall ratio by d-cache miss [%] = PM_CMPLU_STALL_DCACHE_MISS/PM_CMPLU_STALL*100 +Stall cycles by full castout queue = PM_CMPLU_STALL_COQ_FULL +Stall ratio by full castout queue [%] = PM_CMPLU_STALL_COQ_FULL/PM_CMPLU_STALL*100 +-- diff --git a/collectors/likwid/groups/power8/STALLS2.txt b/collectors/likwid/groups/power8/STALLS2.txt new file mode 100644 index 0000000..6329624 --- /dev/null +++ b/collectors/likwid/groups/power8/STALLS2.txt @@ -0,0 +1,32 @@ +SHORT Completion stalls (group 2) + +EVENTSET +PMC0 PM_CMPLU_STALL +PMC1 PM_CMPLU_STALL_LSU +PMC2 PM_CMPLU_STALL_FLUSH +PMC3 PM_CMPLU_STALL_BRU +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +CPI PMC5/PMC4 +Stall cycles PMC0 +Stall cycles by load/store unit PMC1 +Stall ratio by load/store unit [%] PMC1/PMC0*100.0 +Stall cycles by pipeline flush PMC2 +Stall ratio by pipeline flush [%] PMC2/PMC0*100.0 +Stall cycles by branch unit PMC3 +Stall ratio by branch unit [%] PMC3/PMC0*100.0 + +LONG +Formulas: +CPI = PM_RUN_CYC / PM_RUN_INST_CMPL +Stall cycles = PM_CMPLU_STALL +Stall cycles by load/store unit = PM_CMPLU_STALL_LSU +Stall ratio by load/store unit [%] = PM_CMPLU_STALL_LSU/PM_CMPLU_STALL*100.0 +Stall cycles by pipeline flush = PM_CMPLU_STALL_FLUSH +Stall ratio by pipeline flush [%] = PM_CMPLU_STALL_FLUSH/PM_CMPLU_STALL*100.0 +Stall cycles by branch unit = PM_CMPLU_STALL_BRU +Stall ratio by branch unit [%] = PM_CMPLU_STALL_BRU/PM_CMPLU_STALL*100.0 +-- diff --git a/collectors/likwid/groups/power8/TLB_DATA.txt b/collectors/likwid/groups/power8/TLB_DATA.txt new file mode 100644 index 0000000..c7df459 --- /dev/null +++ b/collectors/likwid/groups/power8/TLB_DATA.txt @@ -0,0 +1,37 @@ +SHORT L1 Data TLB miss rate/ratio + +EVENTSET +PMC0 PM_DTLB_MISS_16G +PMC1 PM_DTLB_MISS_4K +PMC2 PM_DTLB_MISS_64K +PMC3 PM_DTLB_MISS_16M +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +L1 DTLB 4K misses PMC1 +L1 DTLB 4K miss rate PMC1/PMC4 +L1 DTLB 64K misses PMC2 +L1 DTLB 64K miss rate PMC2/PMC4 +L1 DTLB 16M misses PMC3 +L1 DTLB 16M miss rate PMC3/PMC4 +L1 DTLB 16G misses PMC0 +L1 DTLB 16G miss rate PMC0/PMC4 + +LONG +Formulas: +CPI = PM_RUN_CYC / PM_RUN_INST_CMPL +L1 DTLB 4K misses = PM_DTLB_MISS_4K +L1 DTLB 4K miss rate = PM_DTLB_MISS_4K/PM_RUN_INST_CMPL +L1 DTLB 64K misses = PM_DTLB_MISS_64K +L1 DTLB 64K miss rate = PM_DTLB_MISS_64K/PM_RUN_INST_CMPL +L1 DTLB 16M misses = PM_DTLB_MISS_16M +L1 DTLB 16M miss rate = PM_DTLB_MISS_16M/PM_RUN_INST_CMPL +L1 DTLB 16G misses =
PM_DTLB_MISS_16G +L1 DTLB 16G miss rate = PM_DTLB_MISS_16G/PM_RUN_INST_CMPL +-- +The DTLB miss rates give a measure of how often a TLB miss occurred +per instruction. + diff --git a/collectors/likwid/groups/power8/TLB_INSTR.txt b/collectors/likwid/groups/power8/TLB_INSTR.txt new file mode 100644 index 0000000..3f8b79c --- /dev/null +++ b/collectors/likwid/groups/power8/TLB_INSTR.txt @@ -0,0 +1,21 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +PMC2 PM_ITLB_MISS +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +L1 ITLB misses PMC2 +L1 ITLB miss rate PMC2/PMC4 + +LONG +Formulas: +CPI = PM_RUN_CYC / PM_RUN_INST_CMPL +L1 ITLB misses = PM_ITLB_MISS +L1 ITLB miss rate = PM_ITLB_MISS/PM_RUN_INST_CMPL +-- +The ITLB miss rate gives a measure of how often a TLB miss occurred per instruction. + diff --git a/collectors/likwid/groups/power8/USEFUL.txt b/collectors/likwid/groups/power8/USEFUL.txt new file mode 100644 index 0000000..0b5fc8f --- /dev/null +++ b/collectors/likwid/groups/power8/USEFUL.txt @@ -0,0 +1,24 @@ +SHORT Rate of useful instructions + +EVENTSET +PMC0 PM_IOPS_CMPL +PMC1 PM_INST_DISP +PMC2 PM_IOPS_DISP +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +CPI PMC5/PMC4 +Useful instr. rate PMC4/PMC1*100.0 +Useful uops rate PMC0/PMC2*100.0 + + +LONG +Formulas: +CPI = PM_RUN_CYC / PM_RUN_INST_CMPL +Useful instr. rate = PM_RUN_INST_CMPL/PM_INST_DISP*100.0 +Useful uops rate = PM_IOPS_CMPL/PM_IOPS_DISP*100.0 +-- +This group measures how many of the dispatched instructions and internal operations (uops) are +actually completed. These metrics compare the speculatively dispatched instructions with the +completed instructions. diff --git a/collectors/likwid/groups/power9/BRANCH.txt b/collectors/likwid/groups/power9/BRANCH.txt new file mode 100644 index 0000000..1f6dd0d --- /dev/null +++ b/collectors/likwid/groups/power9/BRANCH.txt @@ -0,0 +1,30 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +PMC1 PM_BR_PRED +PMC2 PM_IOPS_CMPL +PMC3 PM_BR_MPRED_CMPL +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +Branch rate (PMC1)/PMC4 +Branch misprediction rate PMC3/PMC4 +Branch misprediction ratio PMC3/(PMC1) +Instructions per branch PMC4/(PMC1) +Operations per branch PMC2/PMC1 + +LONG +Formulas: +Branch rate = PM_BR_PRED/PM_RUN_INST_CMPL +Branch misprediction rate = PM_BR_MPRED_CMPL/PM_RUN_INST_CMPL +Branch misprediction ratio = PM_BR_MPRED_CMPL/PM_BR_PRED +Instructions per branch = PM_RUN_INST_CMPL/PM_BR_PRED +Operations per branch = PM_IOPS_CMPL/PM_BR_PRED +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio directly relates +the mispredicted branches to all branch instructions. +Instructions per branch is 1/Branch rate. + diff --git a/collectors/likwid/groups/power9/DATA.txt b/collectors/likwid/groups/power9/DATA.txt new file mode 100644 index 0000000..a8a7cae --- /dev/null +++ b/collectors/likwid/groups/power9/DATA.txt @@ -0,0 +1,23 @@ +SHORT Load to store ratio + +EVENTSET +PMC3 PM_LD_CMPL +PMC1 PM_ST_CMPL +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +Load to store ratio PMC3/PMC1 +Load rate PMC3/PMC4 +Store rate PMC1/PMC4 + +LONG +Formulas: +Load to store ratio = PM_LD_CMPL/PM_ST_CMPL +Load rate = PM_LD_CMPL/PM_RUN_INST_CMPL +Store rate = PM_ST_CMPL/PM_RUN_INST_CMPL +- +This is a metric to determine your load to store ratio.
+ diff --git a/collectors/likwid/groups/power9/FLOPS.txt b/collectors/likwid/groups/power9/FLOPS.txt new file mode 100644 index 0000000..ffaf11f --- /dev/null +++ b/collectors/likwid/groups/power9/FLOPS.txt @@ -0,0 +1,25 @@ +SHORT SP/DP scalar/vector MFlops/s + +EVENTSET +PMC3 PM_FLOP_CMPL +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +SP/DP [MFLOP/s] (scalar assumed) 1.0E-06*PMC3*2.0/time +SP [MFLOP/s] (vector assumed) 1.0E-06*PMC3*8.0/time +DP [MFLOP/s] (vector assumed) 1.0E-06*PMC3*4.0/time + +LONG +Formulas: +CPI = PM_RUN_CYC/PM_RUN_INST_CMPL +SP/DP [MFLOP/s] (scalar assumed) = 1.0E-06*PM_FLOP_CMPL*2.0/runtime +SP [MFLOP/s] (vector assumed) = 1.0E-06*PM_FLOP_CMPL*8.0/runtime +DP [MFLOP/s] (vector assumed) = 1.0E-06*PM_FLOP_CMPL*4.0/runtime +-- +This group counts floating-point operations. Everything is derived from the +single event PM_FLOP_CMPL, so if you have mixed usage of SP and DP and of +scalar and vector operations, the counts won't be exact. With pure codes +the counts are pretty accurate (e.g. when using likwid-bench). diff --git a/collectors/likwid/groups/power9/FLOPS_FMA.txt b/collectors/likwid/groups/power9/FLOPS_FMA.txt new file mode 100644 index 0000000..65e9b3b --- /dev/null +++ b/collectors/likwid/groups/power9/FLOPS_FMA.txt @@ -0,0 +1,21 @@ +SHORT Floating-point operations with scalar FMA instructions + +EVENTSET +PMC3 PM_FMA_CMPL +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +Scalar FMAs PMC3 +Scalar FMA [MFLOP/s] 1E-6*(PMC3)*2.0/time + +LONG +Formulas: +Scalar FMAs = PM_FMA_CMPL +Scalar FMA [MFLOP/s] = 1E-6*(PM_FMA_CMPL)*2.0/runtime +-- +This group counts scalar FMA operations. +PM_FMA_CMPL: Two-flop instructions completed (fmadd, fnmadd, fmsub, +fnmsub). Scalar instructions only. diff --git a/collectors/likwid/groups/power9/FLOPS_VSX.txt b/collectors/likwid/groups/power9/FLOPS_VSX.txt new file mode 100644 index 0000000..594adf0 --- /dev/null +++ b/collectors/likwid/groups/power9/FLOPS_VSX.txt @@ -0,0 +1,23 @@ +SHORT Vectorized MFlops/s + +EVENTSET +PMC1 PM_VSU_FIN +PMC3 PM_VECTOR_FLOP_CMPL +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +SP [MFLOP/s] (assumed) 1.0E-06*(PMC3*8.0)/time +DP [MFLOP/s] (assumed) 1.0E-06*(PMC3*4.0)/time +Vector MIOPS/s 1.0E-06*(PMC1)/time + +LONG +Formulas: +CPI = PM_RUN_CYC/PM_RUN_INST_CMPL +SP [MFLOP/s] (assumed) = 1.0E-06*(PM_VECTOR_FLOP_CMPL*8)/runtime +DP [MFLOP/s] (assumed) = 1.0E-06*(PM_VECTOR_FLOP_CMPL*4)/runtime +Vector MIOPS/s = 1.0E-06*(PM_VSU_FIN)/runtime +-- +This group measures vector operations. There is no differentiation between SP and DP possible. diff --git a/collectors/likwid/groups/power9/ICACHE.txt b/collectors/likwid/groups/power9/ICACHE.txt new file mode 100644 index 0000000..7a07fd4 --- /dev/null +++ b/collectors/likwid/groups/power9/ICACHE.txt @@ -0,0 +1,22 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +PMC0 PM_INST_FROM_L1 +PMC1 PM_L1_ICACHE_MISS +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +L1I request rate PMC0/PMC4 +L1I miss rate PMC1/PMC4 +L1I miss ratio PMC1/PMC0 + +LONG +Formulas: +L1I request rate = PM_INST_FROM_L1 / PM_RUN_INST_CMPL +L1I miss rate = PM_L1_ICACHE_MISS / PM_RUN_INST_CMPL +L1I miss ratio = PM_L1_ICACHE_MISS / PM_INST_FROM_L1 +- +This group measures some L1 instruction cache metrics.
diff --git a/collectors/likwid/groups/power9/L2CACHE.txt b/collectors/likwid/groups/power9/L2CACHE.txt new file mode 100644 index 0000000..9873251 --- /dev/null +++ b/collectors/likwid/groups/power9/L2CACHE.txt @@ -0,0 +1,33 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +PMC1 PM_L2_LD_MISS +PMC2 PM_L2_LD_DISP +PMC3 PM_L2_ST_DISP +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +L2 request rate (PMC2+PMC3)/PMC4 +L2 load miss rate PMC1/PMC4 +L2 load miss ratio PMC1/(PMC2+PMC3) + +LONG +Formulas: +L2 request rate = (PM_L2_LD_DISP+PM_L2_ST_DISP)/PM_RUN_INST_CMPL +L2 load miss rate = (PM_L2_LD_MISS)/PM_RUN_INST_CMPL +L2 load miss ratio = (PM_L2_LD_MISS)/(PM_L2_LD_DISP+PM_L2_ST_DISP) +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cachelines from memory. And finally the L2 load miss ratio tells you how many of your +memory references required a cacheline to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/power9/L2LOAD.txt b/collectors/likwid/groups/power9/L2LOAD.txt new file mode 100644 index 0000000..ebac5a2 --- /dev/null +++ b/collectors/likwid/groups/power9/L2LOAD.txt @@ -0,0 +1,23 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +PMC0 PM_L2_LD +PMC2 PM_L2_INST +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +L2 load bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC2)*128.0/time +L2 load data volume [GBytes] 1.0E-09*(PMC0+PMC2)*128.0 + +LONG +Formulas: +CPI = PM_RUN_CYC/PM_RUN_INST_CMPL +L2 load bandwidth [MBytes/s] = 1.0E-06*(PM_L2_LD+PM_L2_INST)*128.0/time +L2 load data volume [GBytes] = 1.0E-09*(PM_L2_LD+PM_L2_INST)*128.0 +- +Profiling group to measure L2 load cache bandwidth. The bandwidth is computed by the +number of cache lines loaded from the L2 cache to the L1. diff --git a/collectors/likwid/groups/power9/L2STORE.txt b/collectors/likwid/groups/power9/L2STORE.txt new file mode 100644 index 0000000..3b1c0af --- /dev/null +++ b/collectors/likwid/groups/power9/L2STORE.txt @@ -0,0 +1,22 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +PMC0 PM_L2_ST +PMC4 PM_RUN_INST_CMPL +PMC5 PM_RUN_CYC + + +METRICS +Runtime (RDTSC) [s] time +CPI PMC5/PMC4 +L2 store bandwidth [MBytes/s] 1.0E-06*(PMC0)*128.0/time +L2 store data volume [GBytes] 1.0E-09*(PMC0)*128.0 + +LONG +Formulas: +CPI = PM_RUN_CYC/PM_RUN_INST_CMPL +L2 store bandwidth [MBytes/s] = 1.0E-06*(PM_L2_ST)*128.0/time +L2 store data volume [GBytes] = 1.0E-09*(PM_L2_ST)*128.0 +- +Profiling group to measure L2 store cache bandwidth. The bandwidth is computed by the +number of cache lines stored from the L1 cache to the L2.
diff --git a/collectors/likwid/groups/power9/L3.txt b/collectors/likwid/groups/power9/L3.txt
new file mode 100644
index 0000000..cb97ead
--- /dev/null
+++ b/collectors/likwid/groups/power9/L3.txt
@@ -0,0 +1,29 @@
+SHORT L3 cache bandwidth in MBytes/s
+
+EVENTSET
+PMC0 PM_L3_LD_PREF
+PMC3 PM_DATA_FROM_L3
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+L3D load bandwidth [MBytes/s] 1.0E-06*(PMC3+PMC0)*128.0/time
+L3D load data volume [GBytes] 1.0E-09*(PMC3+PMC0)*128.0
+Loads from local L3 per cycle 100.0*(PMC3+PMC0)/PMC5
+
+LONG
+Formulas:
+CPI = PM_RUN_CYC/PM_RUN_INST_CMPL
+L3D load bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_FROM_L3+PM_L3_LD_PREF)*128.0/time
+L3D load data volume [GBytes] = 1.0E-09*(PM_DATA_FROM_L3+PM_L3_LD_PREF)*128.0
+L3D evict bandwidth [MBytes/s] = 1.0E-06*(PM_L2_CASTOUT_MOD)*128.0/time
+L3D evict data volume [GBytes] = 1.0E-09*(PM_L2_CASTOUT_MOD)*128.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(PM_DATA_FROM_L3+PM_L2_CASTOUT_MOD)*128.0/time
+L3 data volume [GBytes] = 1.0E-09*(PM_DATA_FROM_L3+PM_L2_CASTOUT_MOD)*128.0
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed from the
+number of cache lines loaded from the L3 into the L2 data cache. There is currently no
+event to get the evicted data volume, so the evict and total formulas (based on
+PM_L2_CASTOUT_MOD) are listed for reference only and are not part of this event set.
diff --git a/collectors/likwid/groups/power9/MEM.txt b/collectors/likwid/groups/power9/MEM.txt
new file mode 100644
index 0000000..022d39d
--- /dev/null
+++ b/collectors/likwid/groups/power9/MEM.txt
@@ -0,0 +1,47 @@
+SHORT Main memory bandwidth in MBytes/s
+
+EVENTSET
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+MBOX0C0 PM_MBA0_READ_BYTES
+MBOX0C1 PM_MBA0_WRITE_BYTES
+MBOX1C0 PM_MBA1_READ_BYTES
+MBOX1C1 PM_MBA1_WRITE_BYTES
+MBOX2C0 PM_MBA2_READ_BYTES
+MBOX2C1 PM_MBA2_WRITE_BYTES
+MBOX3C0 PM_MBA3_READ_BYTES
+MBOX3C1 PM_MBA3_WRITE_BYTES
+MBOX4C0 PM_MBA4_READ_BYTES
+MBOX4C1 PM_MBA4_WRITE_BYTES
+MBOX5C0 PM_MBA5_READ_BYTES
+MBOX5C1 PM_MBA5_WRITE_BYTES
+MBOX6C0 PM_MBA6_READ_BYTES
+MBOX6C1 PM_MBA6_WRITE_BYTES
+MBOX7C0 PM_MBA7_READ_BYTES
+MBOX7C1 PM_MBA7_WRITE_BYTES
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(PM_MBAx_READ_BYTES))*64.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*(SUM(PM_MBAx_READ_BYTES))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(PM_MBAx_WRITE_BYTES))*64.0/runtime
+Memory write data volume [GBytes] = 1.0E-09*(SUM(PM_MBAx_WRITE_BYTES))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(PM_MBAx_READ_BYTES)+SUM(PM_MBAx_WRITE_BYTES))*64.0/runtime
+Memory data volume [GBytes] = 1.0E-09*(SUM(PM_MBAx_READ_BYTES)+SUM(PM_MBAx_WRITE_BYTES))*64.0
+-
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events, it is only possible to measure on a
+per-socket basis. Some of the counters may not be available on your system.
+The group also outputs the total data volume transferred from main memory.
diff --git a/collectors/likwid/groups/power9/TLB_DATA.txt b/collectors/likwid/groups/power9/TLB_DATA.txt
new file mode 100644
index 0000000..3d77654
--- /dev/null
+++ b/collectors/likwid/groups/power9/TLB_DATA.txt
@@ -0,0 +1,42 @@
+SHORT L1 Data TLB miss rate/ratio
+
+EVENTSET
+PMC0 PM_LSU_DTLB_MISS_16G_1G
+PMC1 PM_LSU_DTLB_MISS_4K
+PMC2 PM_LSU_DTLB_MISS_64K
+PMC3 PM_LSU_DTLB_MISS_16M_2M
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+L1 DTLB 4K misses PMC1
+L1 DTLB 4K miss rate PMC1/PMC4
+L1 DTLB 4K miss ratio [%] (PMC1/(PMC0+PMC1+PMC2+PMC3))*100.0
+L1 DTLB 64K misses PMC2
+L1 DTLB 64K miss rate PMC2/PMC4
+L1 DTLB 64K miss ratio [%] (PMC2/(PMC0+PMC1+PMC2+PMC3))*100.0
+L1 DTLB 16M/2M misses PMC3
+L1 DTLB 16M/2M miss rate PMC3/PMC4
+L1 DTLB 16M/2M miss ratio [%] (PMC3/(PMC0+PMC1+PMC2+PMC3))*100.0
+L1 DTLB 16G/1G misses PMC0
+L1 DTLB 16G/1G miss rate PMC0/PMC4
+L1 DTLB 16G/1G miss ratio [%] (PMC0/(PMC0+PMC1+PMC2+PMC3))*100.0
+
+LONG
+Formulas:
+L1 DTLB 4K misses = PM_LSU_DTLB_MISS_4K
+L1 DTLB 4K miss rate = PM_LSU_DTLB_MISS_4K/PM_RUN_INST_CMPL
+L1 DTLB 4K miss ratio [%] = (PM_LSU_DTLB_MISS_4K/(PM_LSU_DTLB_MISS_4K+PM_LSU_DTLB_MISS_64K+PM_LSU_DTLB_MISS_16M_2M+PM_LSU_DTLB_MISS_16G_1G))*100
+L1 DTLB 64K misses = PM_LSU_DTLB_MISS_64K
+L1 DTLB 64K miss rate = PM_LSU_DTLB_MISS_64K/PM_RUN_INST_CMPL
+L1 DTLB 64K miss ratio [%] = (PM_LSU_DTLB_MISS_64K/(PM_LSU_DTLB_MISS_4K+PM_LSU_DTLB_MISS_64K+PM_LSU_DTLB_MISS_16M_2M+PM_LSU_DTLB_MISS_16G_1G))*100
+L1 DTLB 16M/2M misses = PM_LSU_DTLB_MISS_16M_2M
+L1 DTLB 16M/2M miss rate = PM_LSU_DTLB_MISS_16M_2M/PM_RUN_INST_CMPL
+L1 DTLB 16M/2M miss ratio [%] = (PM_LSU_DTLB_MISS_16M_2M/(PM_LSU_DTLB_MISS_4K+PM_LSU_DTLB_MISS_64K+PM_LSU_DTLB_MISS_16M_2M+PM_LSU_DTLB_MISS_16G_1G))*100
+L1 DTLB 16G/1G misses = PM_LSU_DTLB_MISS_16G_1G
+L1 DTLB 16G/1G miss rate = PM_LSU_DTLB_MISS_16G_1G/PM_RUN_INST_CMPL
+L1 DTLB 16G/1G miss ratio [%] = (PM_LSU_DTLB_MISS_16G_1G/(PM_LSU_DTLB_MISS_4K+PM_LSU_DTLB_MISS_64K+PM_LSU_DTLB_MISS_16M_2M+PM_LSU_DTLB_MISS_16G_1G))*100
+-
+This group measures the data TLB misses for different page sizes.
diff --git a/collectors/likwid/groups/power9/TLB_INSTR.txt b/collectors/likwid/groups/power9/TLB_INSTR.txt
new file mode 100644
index 0000000..dc99d8a
--- /dev/null
+++ b/collectors/likwid/groups/power9/TLB_INSTR.txt
@@ -0,0 +1,21 @@
+SHORT L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+PMC3 PM_ITLB_MISS
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI PMC5/PMC4
+L1 ITLB misses PMC3
+L1 ITLB miss rate PMC3/PMC4
+
+LONG
+Formulas:
+L1 ITLB misses = PM_ITLB_MISS
+L1 ITLB miss rate = PM_ITLB_MISS/PM_RUN_INST_CMPL
+-
+This group measures the reloads of the instruction TLB.
+Misses to the HPT are counted once, while misses in the Radix
+tree count the number of tree levels traversed.
diff --git a/collectors/likwid/groups/power9/USEFUL.txt b/collectors/likwid/groups/power9/USEFUL.txt
new file mode 100644
index 0000000..bbc20a0
--- /dev/null
+++ b/collectors/likwid/groups/power9/USEFUL.txt
@@ -0,0 +1,22 @@
+SHORT Rate of useful instructions
+
+EVENTSET
+PMC0 PM_RUN_SPURR
+PMC1 PM_INST_DISP
+PMC3 PM_RUN_PURR
+PMC4 PM_RUN_INST_CMPL
+PMC5 PM_RUN_CYC
+
+METRICS
+CPI PMC5/PMC4
+Useful instr. rate [%] (PMC4/PMC1)*100.0
+Processor Utilization [%] (PMC0/PMC3)*100.0
+
+
+LONG
+Formulas:
+Useful instr. rate [%] = (PM_RUN_INST_CMPL/PM_INST_DISP)*100
+Processor Utilization [%] = (PM_RUN_SPURR/PM_RUN_PURR)*100
+--
+This performance group shows the overhead of speculative
+execution of instructions and the processor utilization.
diff --git a/collectors/likwid/groups/sandybridge/BRANCH.txt b/collectors/likwid/groups/sandybridge/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 BR_INST_RETIRED_ALL_BRANCHES
+PMC1 BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Branch rate PMC0/FIXC0
+Branch misprediction rate PMC1/FIXC0
+Branch misprediction ratio PMC1/PMC0
+Instructions per branch FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often, on average, a branch or a mispredicted branch occurred
+per retired instruction. The branch misprediction ratio directly relates the
+number of mispredicted branches to the total number of branch instructions.
+Instructions per branch is 1/branch rate.
+
diff --git a/collectors/likwid/groups/sandybridge/CLOCK.txt b/collectors/likwid/groups/sandybridge/CLOCK.txt
new file mode 100644
index 0000000..a888d66
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/CLOCK.txt
@@ -0,0 +1,30 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+UBOXFIX UNCORE_CLOCK
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+Uncore Clock [MHz] 1.E-06*UBOXFIX/time
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formulas:
+Power = PWR_PKG_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time
+-
+SandyBridge implements the new RAPL interface. This interface enables monitoring
+of the consumed energy on the package (socket) and DRAM level.
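The RAPL-derived metrics are simple quotients: the PWR counters report Joules accumulated over the measurement interval, so power is energy divided by runtime. A Go sketch with hypothetical readings (not taken from a real run):

    package main

    import "fmt"

    func main() {
        pwrPkgEnergy := 45.0 // PWR0: PWR_PKG_ENERGY [J] (hypothetical)
        pwrDramEnergy := 6.5 // PWR3: PWR_DRAM_ENERGY [J] (hypothetical)
        time := 1.5          // Runtime (RDTSC) [s]

        // Power = PWR_PKG_ENERGY / time, Power DRAM = PWR_DRAM_ENERGY / time
        fmt.Printf("Power [W]: %.2f\n", pwrPkgEnergy/time)
        fmt.Printf("Power DRAM [W]: %.2f\n", pwrDramEnergy/time)
    }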
+ diff --git a/collectors/likwid/groups/sandybridge/CYCLE_ACTIVITY.txt b/collectors/likwid/groups/sandybridge/CYCLE_ACTIVITY.txt new file mode 100644 index 0000000..8dbfe25 --- /dev/null +++ b/collectors/likwid/groups/sandybridge/CYCLE_ACTIVITY.txt @@ -0,0 +1,33 @@ +SHORT Cycle Activities + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING +PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING +PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Cycles without execution [%] (PMC3/FIXC1)*100 +Cycles without execution due to L1D [%] (PMC2/FIXC1)*100 +Cycles without execution due to L2 [%] (PMC0/FIXC1)*100 + +LONG +Formulas: +Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100 +Cycles with stalls due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles with stalls due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100 +-- +This performance group measures the cycles while waiting for data from the cache +and memory hierarchy. +CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts number of cycles nothing is executed on +any execution port. +CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is +outstanding. diff --git a/collectors/likwid/groups/sandybridge/CYCLE_STALLS.txt b/collectors/likwid/groups/sandybridge/CYCLE_STALLS.txt new file mode 100644 index 0000000..d66cbb1 --- /dev/null +++ b/collectors/likwid/groups/sandybridge/CYCLE_STALLS.txt @@ -0,0 +1,38 @@ +SHORT Cycle Activities (Stalls) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING +PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING +PMC3 CYCLE_ACTIVITY_STALLS_TOTAL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Total execution stalls PMC3 +Stalls caused by L1D misses [%] (PMC2/PMC3)*100 +Stalls caused by L2 misses [%] (PMC0/PMC3)*100 +Execution stall rate [%] (PMC3/FIXC1)*100 +Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100 +Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100 + +LONG +Formulas: +Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL +Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100 +-- +This performance group measures the stalls caused by data traffic in the cache +hierarchy. +CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls. +CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand +load is outstanding. 
diff --git a/collectors/likwid/groups/sandybridge/DATA.txt b/collectors/likwid/groups/sandybridge/DATA.txt
new file mode 100644
index 0000000..967cbad
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_UOPS_RETIRED_LOADS
+PMC1 MEM_UOPS_RETIRED_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/collectors/likwid/groups/sandybridge/DIVIDE.txt b/collectors/likwid/groups/sandybridge/DIVIDE.txt
new file mode 100644
index 0000000..504181c
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/DIVIDE.txt
@@ -0,0 +1,24 @@
+SHORT Divide unit information
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ARITH_NUM_DIV
+PMC1 ARITH_FPU_DIV_ACTIVE
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Number of divide ops PMC0
+Avg. divide unit usage duration PMC1/PMC0
+
+LONG
+Formulas:
+Number of divide ops = ARITH_NUM_DIV
+Avg. divide unit usage duration = ARITH_FPU_DIV_ACTIVE/ARITH_NUM_DIV
+-
+This performance group measures the average latency of divide operations.
diff --git a/collectors/likwid/groups/sandybridge/ENERGY.txt b/collectors/likwid/groups/sandybridge/ENERGY.txt
new file mode 100644
index 0000000..9898c70
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/ENERGY.txt
@@ -0,0 +1,37 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+TMP0 TEMP_CORE
+PWR0 PWR_PKG_ENERGY
+PWR1 PWR_PP0_ENERGY
+PWR2 PWR_PP1_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Temperature [C] TMP0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy PP0 [J] PWR1
+Power PP0 [W] PWR1/time
+Energy PP1 [J] PWR2
+Power PP1 [W] PWR2/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formulas:
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power PP1 = PWR_PP1_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+SandyBridge implements the new RAPL interface. This interface enables monitoring
+of the consumed energy on the package (socket) level.
+ diff --git a/collectors/likwid/groups/sandybridge/FALSE_SHARE.txt b/collectors/likwid/groups/sandybridge/FALSE_SHARE.txt new file mode 100644 index 0000000..fbec3f4 --- /dev/null +++ b/collectors/likwid/groups/sandybridge/FALSE_SHARE.txt @@ -0,0 +1,25 @@ +SHORT False sharing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM +PMC2 MEM_LOAD_UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local LLC false sharing [MByte] 1.E-06*PMC0*64 +Local LLC false sharing rate PMC0/PMC2 + +LONG +Formulas: +Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM*64 +Local LLC false sharing rate = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL +- +False-sharing of cache lines can dramatically reduce the performance of an +application. This performance group measures the L3 traffic induced by false-sharing. +The false-sharing rate uses all memory loads as reference. diff --git a/collectors/likwid/groups/sandybridge/FLOPS_AVX.txt b/collectors/likwid/groups/sandybridge/FLOPS_AVX.txt new file mode 100644 index 0000000..5a3f14f --- /dev/null +++ b/collectors/likwid/groups/sandybridge/FLOPS_AVX.txt @@ -0,0 +1,26 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 SIMD_FP_256_PACKED_SINGLE +PMC1 SIMD_FP_256_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime +- +Packed 32b AVX FLOPs rates. +Please note that the current FLOP measurements on SandyBridge are +potentially wrong. So you cannot trust these counters at the moment! 
+
diff --git a/collectors/likwid/groups/sandybridge/FLOPS_DP.txt b/collectors/likwid/groups/sandybridge/FLOPS_DP.txt
new file mode 100644
index 0000000..91f8a86
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/FLOPS_DP.txt
@@ -0,0 +1,33 @@
+SHORT Double Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2 SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time
+Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
+Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2)
+
+LONG
+Formulas:
+DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX DP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+Vectorization ratio = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)
+-
+SSE scalar and packed double precision FLOP rates.
+Please note that the current FLOP measurements on SandyBridge are potentially
+wrong. So you cannot trust these counters at the moment!
+
diff --git a/collectors/likwid/groups/sandybridge/FLOPS_SP.txt b/collectors/likwid/groups/sandybridge/FLOPS_SP.txt
new file mode 100644
index 0000000..930a988
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/FLOPS_SP.txt
@@ -0,0 +1,33 @@
+SHORT Single Precision MFLOP/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2 SIMD_FP_256_PACKED_SINGLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time
+Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time
+Scalar [MUOPS/s] 1.0E-06*PMC1/time
+Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2)
+
+LONG
+Formulas:
+SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX SP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+Vectorization ratio = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)
+-
+SSE scalar and packed single precision FLOP rates.
+Please note that the current FLOP measurements on SandyBridge are potentially
+wrong. So you cannot trust these counters at the moment!
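The FLOPS_DP metric weights each uop class by its flop count: a packed SSE DP uop carries 2 flops, a scalar uop 1, and a 256-bit AVX DP uop 4. A Go sketch of the DP rate and vectorization ratio, with hypothetical counts (names are illustrative only):

    package main

    import "fmt"

    func main() {
        // Hypothetical uop counts for one interval.
        ssePackedDP := 1.0e9 // PMC0: FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
        sseScalarDP := 2.0e8 // PMC1: FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
        avxPackedDP := 5.0e8 // PMC2: SIMD_FP_256_PACKED_DOUBLE
        time := 1.0          // seconds

        // DP [MFLOP/s] = 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
        dpMflops := 1.0e-06 * (ssePackedDP*2.0 + sseScalarDP + avxPackedDP*4.0) / time
        // Vectorization ratio = 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2)
        vecRatio := 100 * (ssePackedDP + avxPackedDP) /
            (ssePackedDP + sseScalarDP + avxPackedDP)
        fmt.Printf("DP: %.1f MFLOP/s, vectorization ratio: %.1f%%\n", dpMflops, vecRatio)
    }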
+
diff --git a/collectors/likwid/groups/sandybridge/ICACHE.txt b/collectors/likwid/groups/sandybridge/ICACHE.txt
new file mode 100644
index 0000000..f1e2335
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/ICACHE.txt
@@ -0,0 +1,33 @@
+SHORT Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ICACHE_ACCESSES
+PMC1 ICACHE_MISSES
+PMC2 ICACHE_IFETCH_STALL
+PMC3 ILD_STALL_IQ_FULL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+L1I stalls PMC2
+L1I stall rate PMC2/FIXC0
+L1I queue full stalls PMC3
+L1I queue full stall rate PMC3/FIXC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+L1I stalls = ICACHE_IFETCH_STALL
+L1I stall rate = ICACHE_IFETCH_STALL / INSTR_RETIRED_ANY
+-
+This group measures some L1 instruction cache metrics.
diff --git a/collectors/likwid/groups/sandybridge/L2.txt b/collectors/likwid/groups/sandybridge/L2.txt
new file mode 100644
index 0000000..1feb44c
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/L2.txt
@@ -0,0 +1,38 @@
+SHORT L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L1D_REPLACEMENT
+PMC1 L1D_M_EVICT
+PMC2 ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the
+number of cache lines allocated in the L1 and the number of modified cache lines
+evicted from the L1. The group also outputs the total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1 and traffic caused by misses in the
+L1 instruction cache.
+
diff --git a/collectors/likwid/groups/sandybridge/L2CACHE.txt b/collectors/likwid/groups/sandybridge/L2CACHE.txt
new file mode 100644
index 0000000..fbc3745
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_TRANS_ALL_REQUESTS
+PMC1 L2_RQSTS_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. The L2 request rate tells you how data intensive your code is,
+i.e. how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure of how often it was necessary to get
+cache lines from memory. And finally, the L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be dictated by your algorithm, you should
+try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/collectors/likwid/groups/sandybridge/L3.txt b/collectors/likwid/groups/sandybridge/L3.txt
new file mode 100644
index 0000000..f63a918
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/L3.txt
@@ -0,0 +1,36 @@
+SHORT L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_LINES_IN_ALL
+PMC1 L2_TRANS_L2_WB
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed from the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. This group also outputs the data volume transferred between the
+L3 and the measured cores' L2 caches. Note that this bandwidth also includes data
+transfers due to a write allocate load on a store miss in L2.
+
diff --git a/collectors/likwid/groups/sandybridge/L3CACHE.txt b/collectors/likwid/groups/sandybridge/L3CACHE.txt
new file mode 100644
index 0000000..3dbb6cc
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/L3CACHE.txt
@@ -0,0 +1,36 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0:MATCH0=0x0081:MATCH1=0x3fffc0 OFFCORE_RESPONSE_0_OPTIONS
+PMC1:MATCH0=0x0081:MATCH1=0x1 OFFCORE_RESPONSE_1_OPTIONS
+PMC2 L1D_REPLACEMENT
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 request rate PMC1/FIXC0
+L3 miss rate PMC0/FIXC0
+L3 miss ratio PMC0/PMC1
+
+LONG
+Formulas:
+L3 request rate = OFFCORE_RESPONSE_1_OPTIONS:MATCH0=0x0081:MATCH1=0x1/INSTR_RETIRED_ANY
+L3 miss rate = OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x0081:MATCH1=0x3fffc0/INSTR_RETIRED_ANY
+L3 miss ratio = OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x0081:MATCH1=0x3fffc0/OFFCORE_RESPONSE_1_OPTIONS:MATCH0=0x0081:MATCH1=0x1
+-
+This group measures the locality of your data accesses with regard to the
+L3 cache. The L3 request rate tells you how data intensive your code is,
+i.e. how many data accesses you have on average per instruction.
+The L3 miss rate gives a measure of how often it was necessary to get
+cache lines from L3 compared to all loaded cache lines in L1.
+And finally, the L3 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be dictated by your algorithm, you should
+try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/collectors/likwid/groups/sandybridge/PORT_USAGE.txt b/collectors/likwid/groups/sandybridge/PORT_USAGE.txt
new file mode 100644
index 0000000..d509607
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/PORT_USAGE.txt
@@ -0,0 +1,40 @@
+SHORT Execution port utilization
+
+REQUIRE_NOHT
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_DISPATCHED_PORT_PORT_0
+PMC1 UOPS_DISPATCHED_PORT_PORT_1
+PMC2 UOPS_DISPATCHED_PORT_PORT_2
+PMC3 UOPS_DISPATCHED_PORT_PORT_3
+PMC4 UOPS_DISPATCHED_PORT_PORT_4
+PMC5 UOPS_DISPATCHED_PORT_PORT_5
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Port0 usage ratio PMC0/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5)
+Port1 usage ratio PMC1/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5)
+Port2 usage ratio PMC2/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5)
+Port3 usage ratio PMC3/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5)
+Port4 usage ratio PMC4/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5)
+Port5 usage ratio PMC5/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5)
+
+LONG
+Formulas:
+Port0 usage ratio = UOPS_DISPATCHED_PORT_PORT_0/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port1 usage ratio = UOPS_DISPATCHED_PORT_PORT_1/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port2 usage ratio = UOPS_DISPATCHED_PORT_PORT_2/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port3 usage ratio = UOPS_DISPATCHED_PORT_PORT_3/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port4 usage ratio = UOPS_DISPATCHED_PORT_PORT_4/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+Port5 usage ratio = UOPS_DISPATCHED_PORT_PORT_5/SUM(UOPS_DISPATCHED_PORT_PORT_*)
+-
+This group measures the execution port utilization in a CPU core. The group can
+only be measured when HyperThreading is disabled, because only then can each CPU
+core program eight counters.
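The port usage ratios are each port's dispatch count over the sum of all six ports. A Go sketch with hypothetical counts (values invented for illustration):

    package main

    import "fmt"

    func main() {
        // Hypothetical UOPS_DISPATCHED_PORT_PORT_0..5 counts.
        ports := []float64{4.1e9, 3.9e9, 2.2e9, 2.1e9, 1.8e9, 3.0e9}
        var total float64
        for _, p := range ports {
            total += p
        }
        // PortN usage ratio = PMCN / SUM(PMC0..PMC5)
        for i, p := range ports {
            fmt.Printf("Port%d usage ratio: %.3f\n", i, p/total)
        }
    }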
diff --git a/collectors/likwid/groups/sandybridge/RECOVERY.txt b/collectors/likwid/groups/sandybridge/RECOVERY.txt
new file mode 100644
index 0000000..7928ee4
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/RECOVERY.txt
@@ -0,0 +1,22 @@
+SHORT Recovery duration
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 INT_MISC_RECOVERY_CYCLES
+PMC1 INT_MISC_RECOVERY_COUNT
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Avg. recovery duration PMC0/PMC1
+
+LONG
+Formulas:
+Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT
+-
+This group measures the duration of recoveries after SSE exceptions, memory
+disambiguation, etc.
diff --git a/collectors/likwid/groups/sandybridge/TLB_DATA.txt b/collectors/likwid/groups/sandybridge/TLB_DATA.txt
new file mode 100644
index 0000000..8d94e05
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2 DTLB_LOAD_MISSES_WALK_DURATION
+PMC3 DTLB_STORE_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 DTLB load misses PMC0
+L1 DTLB load miss rate PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses PMC1
+L1 DTLB store miss rate PMC1/FIXC0
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The durations measure, in cycles, how long a page walk took.
+
diff --git a/collectors/likwid/groups/sandybridge/TLB_INSTR.txt b/collectors/likwid/groups/sandybridge/TLB_INSTR.txt
new file mode 100644
index 0000000..235d977
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ITLB_MISSES_CAUSES_A_WALK
+PMC1 ITLB_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 ITLB misses PMC0
+L1 ITLB miss rate PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures, in cycles, how long a page walk took.
+
diff --git a/collectors/likwid/groups/sandybridge/TMA.txt b/collectors/likwid/groups/sandybridge/TMA.txt
new file mode 100644
index 0000000..afb4126
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/TMA.txt
@@ -0,0 +1,48 @@
+SHORT Top down cycle allocation
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_ISSUED_ANY
+PMC1 UOPS_RETIRED_RETIRE_SLOTS
+PMC2 IDQ_UOPS_NOT_DELIVERED_CORE
+PMC3 INT_MISC_RECOVERY_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+IPC FIXC0/FIXC1
+Total Slots 4*FIXC1
+Slots Retired PMC1
+Fetch Bubbles PMC2
+Recovery Bubbles 4*PMC3
+Front End [%] PMC2/(4*FIXC1)*100
+Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100
+Retiring [%] PMC1/(4*FIXC1)*100
+Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100
+
+LONG
+Formulas:
+Total Slots = 4*CPU_CLK_UNHALTED_CORE
+Slots Retired = UOPS_RETIRED_RETIRE_SLOTS
+Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE
+Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES
+Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100
+Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100
+Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100
+Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100
+--
+This performance group measures cycles to determine the percentage of time spent in
+the front end, back end, retiring, and speculation. These metrics are published and
+verified by Intel. Further information:
+Webpage describing the Top-Down Method and its usage in Intel VTune:
+https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method
+Paper by Ahmad Yasin:
+https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0
+Slides by Ahmad Yasin:
+http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf
+The performance group was originally published here:
+http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/
diff --git a/collectors/likwid/groups/sandybridge/UOPS.txt b/collectors/likwid/groups/sandybridge/UOPS.txt
new file mode 100644
index 0000000..a4d35d8
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/UOPS.txt
@@ -0,0 +1,32 @@
+SHORT UOPs execution info
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_ISSUED_ANY
+PMC1 UOPS_EXECUTED_THREAD
+PMC2 UOPS_RETIRED_ALL
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Issued UOPs PMC0
+Executed UOPs PMC1
+Retired UOPs PMC2
+
+LONG
+Formulas:
+Issued UOPs = UOPS_ISSUED_ANY
+Executed UOPs = UOPS_EXECUTED_THREAD
+Retired UOPs = UOPS_RETIRED_ALL
+-
+This group returns information about the instruction pipeline. It measures the
+issued, executed, and retired uOPs and returns the number of uOPs which were issued
+but not executed, as well as the number of uOPs which were executed but never retired.
+The executed but not retired uOPs commonly come from speculatively executed branches.
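The top-down accounting above divides 4*cycles issue slots among the four categories. A Go sketch of the four percentages, taken directly from the TMA formulas but with hypothetical counts:

    package main

    import "fmt"

    func main() {
        // Hypothetical counts for one interval.
        cycles := 1.0e9       // FIXC1: CPU_CLK_UNHALTED_CORE
        uopsIssued := 2.8e9   // PMC0: UOPS_ISSUED_ANY
        uopsRetired := 2.6e9  // PMC1: UOPS_RETIRED_RETIRE_SLOTS
        fetchBubbles := 4.0e8 // PMC2: IDQ_UOPS_NOT_DELIVERED_CORE
        recoveries := 2.0e7   // PMC3: INT_MISC_RECOVERY_CYCLES

        slots := 4 * cycles // four issue slots per cycle
        frontEnd := fetchBubbles / slots * 100
        speculation := (uopsIssued - uopsRetired + 4*recoveries) / slots * 100
        retiring := uopsRetired / slots * 100
        backEnd := (1 - (fetchBubbles+uopsIssued+4*recoveries)/slots) * 100
        fmt.Printf("FE %.1f%%  Spec %.1f%%  Ret %.1f%%  BE %.1f%%\n",
            frontEnd, speculation, retiring, backEnd)
    }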
+
diff --git a/collectors/likwid/groups/sandybridge/UOPS_EXEC.txt b/collectors/likwid/groups/sandybridge/UOPS_EXEC.txt
new file mode 100644
index 0000000..7042df7
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/UOPS_EXEC.txt
@@ -0,0 +1,31 @@
+SHORT UOPs execution
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_EXECUTED_USED_CYCLES
+PMC1 UOPS_EXECUTED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the execution stage in the pipeline. Used cycles are all cycles where uOPs are
+executed while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
diff --git a/collectors/likwid/groups/sandybridge/UOPS_ISSUE.txt b/collectors/likwid/groups/sandybridge/UOPS_ISSUE.txt
new file mode 100644
index 0000000..9aac923
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/UOPS_ISSUE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs issuing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_ISSUED_USED_CYCLES
+PMC1 UOPS_ISSUED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the issue stage in the pipeline. Used cycles are all cycles where uOPs are
+issued while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
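The :EDGEDETECT modifier makes the counter register rising edges of the stall condition, i.e. the number of distinct stall phases, so dividing stall cycles by the edge count yields the average phase length. A Go sketch with hypothetical counts:

    package main

    import "fmt"

    func main() {
        stallCycles := 3.0e8 // PMC1: UOPS_ISSUED_STALL_CYCLES (hypothetical)
        stallPhases := 1.5e6 // PMC3:EDGEDETECT: distinct stall phases (hypothetical)

        // Avg stall duration [cycles] = stall cycles / number of stall phases
        fmt.Printf("Avg stall duration: %.1f cycles\n", stallCycles/stallPhases)
    }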
diff --git a/collectors/likwid/groups/sandybridge/UOPS_RETIRE.txt b/collectors/likwid/groups/sandybridge/UOPS_RETIRE.txt
new file mode 100644
index 0000000..0f37585
--- /dev/null
+++ b/collectors/likwid/groups/sandybridge/UOPS_RETIRE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs retirement
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_RETIRED_USED_CYCLES
+PMC1 UOPS_RETIRED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLK_UNHALTED_CORE
+Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE
+Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the retirement stage in the pipeline (re-order buffer). Used cycles are all
+cycles where uOPs are retired while unused cycles refer to pipeline stalls.
+Moreover, the group calculates the average stall duration in cycles.
diff --git a/collectors/likwid/groups/sandybridgeEP/BRANCH.txt b/collectors/likwid/groups/sandybridgeEP/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 BR_INST_RETIRED_ALL_BRANCHES
+PMC1 BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Branch rate PMC0/FIXC0
+Branch misprediction rate PMC1/FIXC0
+Branch misprediction ratio PMC1/PMC0
+Instructions per branch FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often, on average, a branch or a mispredicted branch occurred
+per retired instruction. The branch misprediction ratio directly relates the
+number of mispredicted branches to the total number of branch instructions.
+Instructions per branch is 1/branch rate.
+ diff --git a/collectors/likwid/groups/sandybridgeEP/CACHES.txt b/collectors/likwid/groups/sandybridgeEP/CACHES.txt new file mode 100644 index 0000000..345b8f4 --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/CACHES.txt @@ -0,0 +1,97 @@ +SHORT Some data from the CBOXes + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L1D_M_EVICT +PMC2 L2_LINES_IN_ALL +PMC3 L2_TRANS_L2_WB +CBOX0C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX1C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX2C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX3C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX4C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX5C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX6C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX7C0:STATE=0x3F LLC_LOOKUP_DATA_READ +CBOX0C1 LLC_VICTIMS_M_STATE +CBOX1C1 LLC_VICTIMS_M_STATE +CBOX2C1 LLC_VICTIMS_M_STATE +CBOX3C1 LLC_VICTIMS_M_STATE +CBOX4C1 LLC_VICTIMS_M_STATE +CBOX5C1 LLC_VICTIMS_M_STATE +CBOX6C1 LLC_VICTIMS_M_STATE +CBOX7C1 LLC_VICTIMS_M_STATE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 +L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time +L3 to L2 load data volume [GBytes] 1.0E-09*PMC2*64.0 +L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time +L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 +System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F)*64.0/time +System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F)*64.0 +L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)*64.0/time +L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)*64.0 +L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)*64.0/time +L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0:STATE=0x3F+CBOX1C0:STATE=0x3F+CBOX2C0:STATE=0x3F+CBOX3C0:STATE=0x3F+CBOX4C0:STATE=0x3F+CBOX5C0:STATE=0x3F+CBOX6C0:STATE=0x3F+CBOX7C0:STATE=0x3F+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)*64.0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time +Memory write data volume [GBytes] 
1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+
+
+LONG
+Formulas:
+L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
+L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64
+L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
+L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64
+L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time
+L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64
+L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time
+L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64
+L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time
+L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64
+L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
+L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
+System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F))*64/time
+System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F))*64
+L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M_STATE))*64/time
+L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M_STATE))*64
+L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F)+SUM(LLC_VICTIMS_M_STATE))*64/time
+L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ:STATE=0x3F)+SUM(LLC_VICTIMS_M_STATE))*64
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0
+-
+Group to measure cache transfers between L1 and memory. Please notice that the
+L3 to/from system metrics contain any traffic to the system (memory,
+Intel QPI, etc.) but don't seem to capture everything, because the memory read
+bandwidth and the L3 to L2 bandwidth are commonly higher than the system to L3
+bandwidth.
diff --git a/collectors/likwid/groups/sandybridgeEP/CLOCK.txt b/collectors/likwid/groups/sandybridgeEP/CLOCK.txt
new file mode 100644
index 0000000..a888d66
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/CLOCK.txt
@@ -0,0 +1,30 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+UBOXFIX UNCORE_CLOCK
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+Uncore Clock [MHz] 1.E-06*UBOXFIX/time
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formulas:
+Power = PWR_PKG_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time
+-
+SandyBridge implements the new RAPL interface. This interface enables monitoring
+of the consumed energy on the package (socket) and DRAM level.
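The memory metrics in the CACHES group sum the per-channel CAS counters and scale by the 64-byte cache line. A Go sketch over the four MBOX channels, with hypothetical counts:

    package main

    import "fmt"

    func main() {
        // Hypothetical CAS_COUNT_RD/WR per memory channel (MBOX0..3).
        casRd := []float64{2.0e8, 1.9e8, 2.1e8, 2.0e8}
        casWr := []float64{9.0e7, 8.5e7, 9.2e7, 8.8e7}
        time := 1.0 // seconds

        var rd, wr float64
        for i := range casRd {
            rd += casRd[i]
            wr += casWr[i]
        }
        // Each CAS transfers one 64-byte cache line.
        fmt.Printf("Memory read bandwidth: %.1f MBytes/s\n", 1.0e-06*rd*64/time)
        fmt.Printf("Memory write bandwidth: %.1f MBytes/s\n", 1.0e-06*wr*64/time)
    }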
+ diff --git a/collectors/likwid/groups/sandybridgeEP/CYCLE_ACTIVITY.txt b/collectors/likwid/groups/sandybridgeEP/CYCLE_ACTIVITY.txt new file mode 100644 index 0000000..8dbfe25 --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/CYCLE_ACTIVITY.txt @@ -0,0 +1,33 @@ +SHORT Cycle Activities + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING +PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING +PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Cycles without execution [%] (PMC3/FIXC1)*100 +Cycles without execution due to L1D [%] (PMC2/FIXC1)*100 +Cycles without execution due to L2 [%] (PMC0/FIXC1)*100 + +LONG +Formulas: +Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100 +Cycles with stalls due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles with stalls due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100 +-- +This performance group measures the cycles while waiting for data from the cache +and memory hierarchy. +CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts number of cycles nothing is executed on +any execution port. +CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is +outstanding. diff --git a/collectors/likwid/groups/sandybridgeEP/CYCLE_STALLS.txt b/collectors/likwid/groups/sandybridgeEP/CYCLE_STALLS.txt new file mode 100644 index 0000000..d66cbb1 --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/CYCLE_STALLS.txt @@ -0,0 +1,38 @@ +SHORT Cycle Activities (Stalls) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING +PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING +PMC3 CYCLE_ACTIVITY_STALLS_TOTAL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Total execution stalls PMC3 +Stalls caused by L1D misses [%] (PMC2/PMC3)*100 +Stalls caused by L2 misses [%] (PMC0/PMC3)*100 +Execution stall rate [%] (PMC3/FIXC1)*100 +Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100 +Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100 + +LONG +Formulas: +Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL +Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100 +-- +This performance group measures the stalls caused by data traffic in the cache +hierarchy. +CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls. +CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand +load is outstanding. 
diff --git a/collectors/likwid/groups/sandybridgeEP/DATA.txt b/collectors/likwid/groups/sandybridgeEP/DATA.txt
new file mode 100644
index 0000000..967cbad
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 MEM_UOPS_RETIRED_LOADS
+PMC1 MEM_UOPS_RETIRED_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_UOPS_RETIRED_LOADS/MEM_UOPS_RETIRED_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/collectors/likwid/groups/sandybridgeEP/DIVIDE.txt b/collectors/likwid/groups/sandybridgeEP/DIVIDE.txt
new file mode 100644
index 0000000..504181c
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/DIVIDE.txt
@@ -0,0 +1,24 @@
+SHORT Divide unit information
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ARITH_NUM_DIV
+PMC1 ARITH_FPU_DIV_ACTIVE
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Number of divide ops PMC0
+Avg. divide unit usage duration PMC1/PMC0
+
+LONG
+Formulas:
+Number of divide ops = ARITH_NUM_DIV
+Avg. divide unit usage duration = ARITH_FPU_DIV_ACTIVE/ARITH_NUM_DIV
+-
+This performance group measures the average latency of divide operations.
diff --git a/collectors/likwid/groups/sandybridgeEP/ENERGY.txt b/collectors/likwid/groups/sandybridgeEP/ENERGY.txt
new file mode 100644
index 0000000..1ab4ef3
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/ENERGY.txt
@@ -0,0 +1,33 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+TMP0 TEMP_CORE
+PWR0 PWR_PKG_ENERGY
+PWR1 PWR_PP0_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Temperature [C] TMP0
+Energy [J] PWR0
+Power [W] PWR0/time
+Energy PP0 [J] PWR1
+Power PP0 [W] PWR1/time
+Energy DRAM [J] PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formulas:
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+SandyBridge implements the new RAPL interface. This interface enables monitoring
+of the consumed energy on the package (socket) level.
+ diff --git a/collectors/likwid/groups/sandybridgeEP/FALSE_SHARE.txt b/collectors/likwid/groups/sandybridgeEP/FALSE_SHARE.txt new file mode 100644 index 0000000..27f568a --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/FALSE_SHARE.txt @@ -0,0 +1,27 @@ +SHORT False sharing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM +PMC2 MEM_LOAD_UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local LLC false sharing [MByte] 1.E-06*PMC0*64 +Local LLC false sharing rate PMC0/PMC2 + +LONG +Formulas: +Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM*64 +Local LLC false sharing rate = MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM/MEM_LOAD_UOPS_RETIRED_ALL +- +False-sharing of cache lines can dramatically reduce the performance of an +application. This performance group measures the L3 traffic induced by false-sharing. +The false-sharing rate uses all memory loads as reference. +Intel SandyBridge EP CPUs do not provide the events to measure the false-sharing +over CPU socket boundaries. diff --git a/collectors/likwid/groups/sandybridgeEP/FLOPS_AVX.txt b/collectors/likwid/groups/sandybridgeEP/FLOPS_AVX.txt new file mode 100644 index 0000000..5a3f14f --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/FLOPS_AVX.txt @@ -0,0 +1,26 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 SIMD_FP_256_PACKED_SINGLE +PMC1 SIMD_FP_256_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime +- +Packed 32b AVX FLOPs rates. +Please note that the current FLOP measurements on SandyBridge are +potentially wrong. So you cannot trust these counters at the moment! 
+ diff --git a/collectors/likwid/groups/sandybridgeEP/FLOPS_DP.txt b/collectors/likwid/groups/sandybridgeEP/FLOPS_DP.txt new file mode 100644 index 0000000..91f8a86 --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/FLOPS_DP.txt @@ -0,0 +1,33 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE +PMC2 SIMD_FP_256_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE) +- +SSE scalar and packed double precision FLOP rates. +Please note that the current FLOP measurements on SandyBridge are potentially +wrong. So you cannot trust these counters at the moment! + diff --git a/collectors/likwid/groups/sandybridgeEP/FLOPS_SP.txt b/collectors/likwid/groups/sandybridgeEP/FLOPS_SP.txt new file mode 100644 index 0000000..930a988 --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/FLOPS_SP.txt @@ -0,0 +1,33 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE +PMC2 SIMD_FP_256_PACKED_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime +Vectorization ratio = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE) +- +SSE scalar and packed single precision FLOP rates. +Please note that the current FLOP measurements on SandyBridge are potentially +wrong. So you cannot trust these counters at the moment! 
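The FLOPS_DP/FLOPS_SP arithmetic weights each event by the number of floating-point operations a single uop of that kind performs: SSE packed double = 2, scalar = 1, 256b AVX packed double = 4 (4/1/8 for single precision). A minimal Go sketch of the double-precision case, with invented counter values:

package main

import "fmt"

func main() {
	packedSSE := 4.0e9 // FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE, 2 FLOPs per uop (assumed value)
	scalar := 1.0e9    // FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE, 1 FLOP per uop (assumed value)
	packedAVX := 2.0e9 // SIMD_FP_256_PACKED_DOUBLE, 4 FLOPs per uop (assumed value)
	seconds := 10.0    // runtime (assumed value)

	mflops := 1.0e-06 * (packedSSE*2.0 + scalar + packedAVX*4.0) / seconds
	vecRatio := 100 * (packedSSE + packedAVX) / (packedSSE + scalar + packedAVX)
	fmt.Printf("DP [MFLOP/s] = %.1f, Vectorization ratio = %.1f%%\n", mflops, vecRatio)
}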
+
diff --git a/collectors/likwid/groups/sandybridgeEP/ICACHE.txt b/collectors/likwid/groups/sandybridgeEP/ICACHE.txt
new file mode 100644
index 0000000..f1e2335
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/ICACHE.txt
@@ -0,0 +1,33 @@
+SHORT Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ICACHE_ACCESSES
+PMC1 ICACHE_MISSES
+PMC2 ICACHE_IFETCH_STALL
+PMC3 ILD_STALL_IQ_FULL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+L1I stalls PMC2
+L1I stall rate PMC2/FIXC0
+L1I queue full stalls PMC3
+L1I queue full stall rate PMC3/FIXC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+L1I stalls = ICACHE_IFETCH_STALL
+L1I stall rate = ICACHE_IFETCH_STALL / INSTR_RETIRED_ANY
+-
+This group measures some L1 instruction cache metrics.
diff --git a/collectors/likwid/groups/sandybridgeEP/L2.txt b/collectors/likwid/groups/sandybridgeEP/L2.txt
new file mode 100644
index 0000000..1feb44c
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/L2.txt
@@ -0,0 +1,38 @@
+SHORT L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L1D_REPLACEMENT
+PMC1 L1D_M_EVICT
+PMC2 ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes] 1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the
+number of cache lines allocated in the L1 and the number of modified cache lines
+evicted from the L1. The group also outputs the total data volume transferred
+between L2 and L1. Note that this bandwidth also includes data transfers due to
+write-allocate loads on store misses in L1 and traffic caused by misses in the
+L1 instruction cache.
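The bandwidth pattern used by the L2, L3 and MEM groups is always the same: each counted event stands for one 64-byte cache line, so bandwidth is lines*64/time and data volume is lines*64. A minimal Go sketch with invented counter values:

package main

import "fmt"

func main() {
	l1dReplacement := 5.0e8 // L1D_REPLACEMENT: cache lines loaded into L1 (assumed value)
	l1dEvict := 2.0e8       // L1D_M_EVICT: modified lines evicted from L1 (assumed value)
	icacheMisses := 1.0e6   // ICACHE_MISSES (assumed value)
	seconds := 5.0          // measurement interval (assumed value)

	loadBW := 1.0e-06 * l1dReplacement * 64.0 / seconds                       // L2D load bandwidth [MBytes/s]
	totalVolume := 1.0e-09 * (l1dReplacement + l1dEvict + icacheMisses) * 64.0 // L2 data volume [GBytes]
	fmt.Printf("L2D load bandwidth = %.1f MBytes/s, L2 data volume = %.2f GBytes\n", loadBW, totalVolume)
}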
+
diff --git a/collectors/likwid/groups/sandybridgeEP/L2CACHE.txt b/collectors/likwid/groups/sandybridgeEP/L2CACHE.txt
new file mode 100644
index 0000000..fbc3745
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_TRANS_ALL_REQUESTS
+PMC1 L2_RQSTS_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. The L2 request rate tells you how data-intensive your code is,
+i.e. how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure of how often it was necessary to get
+cache lines from memory. Finally, the L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be dictated by your algorithm, you should
+try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/collectors/likwid/groups/sandybridgeEP/L3.txt b/collectors/likwid/groups/sandybridgeEP/L3.txt
new file mode 100644
index 0000000..f63a918
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/L3.txt
@@ -0,0 +1,36 @@
+SHORT L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 L2_LINES_IN_ALL
+PMC1 L2_TRANS_L2_WB
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes] 1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed from the
+number of cache lines allocated in the L2 and the number of modified cache lines
+evicted from the L2. The group also outputs the data volume transferred between
+the L3 and the L2 caches of the measured cores. Note that this bandwidth also
+includes data transfers due to write-allocate loads on store misses in L2.
+ diff --git a/collectors/likwid/groups/sandybridgeEP/L3CACHE.txt b/collectors/likwid/groups/sandybridgeEP/L3CACHE.txt new file mode 100644 index 0000000..3dbb6cc --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/L3CACHE.txt @@ -0,0 +1,36 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0:MATCH0=0x0081:MATCH1=0x3fffc0 OFFCORE_RESPONSE_0_OPTIONS +PMC1:MATCH0=0x0081:MATCH1=0x1 OFFCORE_RESPONSE_1_OPTIONS +PMC2 L1D_REPLACEMENT + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate PMC1/FIXC0 +L3 miss rate PMC0/FIXC0 +L3 miss ratio PMC0/PMC1 + +LONG +Formulas: +L3 request rate = OFFCORE_RESPONSE_1_OPTIONS:MATCH0=0x0081:MATCH1=0x1/INSTR_RETIRED_ANY +L3 miss rate = OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x0081:MATCH1=0x3fffc0/INSTR_RETIRED_ANY +L3 miss ratio = OFFCORE_RESPONSE_0_OPTIONS:MATCH0=0x0081:MATCH1=0x3fffc0/OFFCORE_RESPONSE_1_OPTIONS:MATCH0=0x0081:MATCH1=0x1 +- +This group measures the locality of your data accesses with regard to the +L3 cache. L3 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L3 miss rate gives a measure how often it was necessary to get +cache lines from L3 compared to all loaded cache lines in L1. +And finally L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/sandybridgeEP/MEM.txt b/collectors/likwid/groups/sandybridgeEP/MEM.txt new file mode 100644 index 0000000..0be0645 --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/MEM.txt @@ -0,0 +1,40 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. 
+Since this group is based on Uncore events it is only possible to measure on a +per socket base. Also outputs total data volume transferred from main memory. + diff --git a/collectors/likwid/groups/sandybridgeEP/MEM_DP.txt b/collectors/likwid/groups/sandybridgeEP/MEM_DP.txt new file mode 100644 index 0000000..f2d68ba --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/MEM_DP.txt @@ -0,0 +1,66 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE +PMC2 SIMD_FP_256_PACKED_DOUBLE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +MFLOP/s 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time +AVX [MFLOP/s] 1.0E-06*(PMC2*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0 +Operational intensity (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime +AVX [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +Operational intensity = (FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) +-- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per socket base. Also outputs total data volume transferred from main memory. +SSE scalar and packed double precision FLOP rates. Also reports on packed AVX +32b instructions. Please note that the current FLOP measurements on SandyBridge +are potentially wrong. So you cannot trust these counters at the moment! 
+The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. diff --git a/collectors/likwid/groups/sandybridgeEP/MEM_SP.txt b/collectors/likwid/groups/sandybridgeEP/MEM_SP.txt new file mode 100644 index 0000000..955cdc4 --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/MEM_SP.txt @@ -0,0 +1,66 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE +PMC2 SIMD_FP_256_PACKED_SINGLE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +MFLOP/s 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time +AVX [MFLOP/s] 1.0E-06*(PMC2*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0 +Operational intensity (PMC0*4.0+PMC1+PMC2*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +MFLOP/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime +AVX [MFLOP/s] = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +Operational intensity = (FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) +-- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per socket base. Also outputs total data volume transferred from main memory. +SSE scalar and packed single precision FLOP rates. Also reports on packed AVX +32b instructions. Please note that the current FLOP measurements on SandyBridge +are potentially wrong. So you cannot trust these counters at the moment! 
+The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. diff --git a/collectors/likwid/groups/sandybridgeEP/NUMA.txt b/collectors/likwid/groups/sandybridgeEP/NUMA.txt new file mode 100644 index 0000000..41fbe62 --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/NUMA.txt @@ -0,0 +1,33 @@ +SHORT Local and remote memory accesses + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 OFFCORE_RESPONSE_0_LOCAL_DRAM +PMC1 OFFCORE_RESPONSE_1_REMOTE_DRAM + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local DRAM data volume [GByte] 1.E-09*PMC0*64 +Local DRAM bandwidth [MByte/s] 1.E-06*(PMC0*64)/time +Remote DRAM data volume [GByte] 1.E-09*PMC1*64 +Remote DRAM bandwidth [MByte/s] 1.E-06*(PMC1*64)/time +Memory data volume [GByte] 1.E-09*(PMC0+PMC1)*64 +Memory bandwidth [MByte/s] 1.E-06*((PMC0+PMC1)*64)/time + +LONG +Formulas: +CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY +Local DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_0_LOCAL_DRAM*64 +Local DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_0_LOCAL_DRAM*64)/time +Remote DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_1_REMOTE_DRAM*64 +Remote DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_1_REMOTE_DRAM*64)/time +Memory data volume [GByte] = 1.E-09*(OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64 +Memory bandwidth [MByte/s] = 1.E-06*((OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64)/time +-- +This performance group measures the data traffic of CPU cores to local and remote +memory. diff --git a/collectors/likwid/groups/sandybridgeEP/PORT_USAGE.txt b/collectors/likwid/groups/sandybridgeEP/PORT_USAGE.txt new file mode 100644 index 0000000..d509607 --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/PORT_USAGE.txt @@ -0,0 +1,40 @@ +SHORT Execution port utilization + +REQUIRE_NOHT + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_DISPATCHED_PORT_PORT_0 +PMC1 UOPS_DISPATCHED_PORT_PORT_1 +PMC2 UOPS_DISPATCHED_PORT_PORT_2 +PMC3 UOPS_DISPATCHED_PORT_PORT_3 +PMC4 UOPS_DISPATCHED_PORT_PORT_4 +PMC5 UOPS_DISPATCHED_PORT_PORT_5 + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Port0 usage ratio PMC0/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port1 usage ratio PMC1/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port2 usage ratio PMC2/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port3 usage ratio PMC3/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port4 usage ratio PMC4/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) +Port5 usage ratio PMC5/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5) + +LONG +Formulas: +Port0 usage ratio = UOPS_DISPATCHED_PORT_PORT_0/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port1 usage ratio = UOPS_DISPATCHED_PORT_PORT_1/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port2 usage ratio = UOPS_DISPATCHED_PORT_PORT_2/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port3 usage ratio = UOPS_DISPATCHED_PORT_PORT_3/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port4 usage ratio = UOPS_DISPATCHED_PORT_PORT_4/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port5 usage ratio = UOPS_DISPATCHED_PORT_PORT_5/SUM(UOPS_DISPATCHED_PORT_PORT_*) +- +This group measures the execution port utilization in a CPU core. 
The group can +only be measured when HyperThreading is disabled because only then each CPU core +can program eight counters. diff --git a/collectors/likwid/groups/sandybridgeEP/QPI.txt b/collectors/likwid/groups/sandybridgeEP/QPI.txt new file mode 100644 index 0000000..320614f --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/QPI.txt @@ -0,0 +1,35 @@ +SHORT QPI traffic between sockets + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +SBOX0C0 DIRECT2CORE_SUCCESS +SBOX0C1 RXL_FLITS_G1_DRS_DATA +SBOX0C2 RXL_FLITS_G2_NCB_DATA +SBOX1C0 DIRECT2CORE_SUCCESS +SBOX1C1 RXL_FLITS_G1_DRS_DATA +SBOX1C2 RXL_FLITS_G2_NCB_DATA + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Received bandwidth from QPI [MBytes/s] 1.0E-06*(SBOX0C1+SBOX0C2+SBOX1C1+SBOX1C2)*8/time +Received data volume from QPI [GBytes] 1.0E-09*(SBOX0C1+SBOX0C2+SBOX1C1+SBOX1C2)*8 +Bandwidth QPI to LLC [MBytes/s] 1.0E-06*(SBOX0C0+SBOX1C0)*64/time +Data volume QPI to LLC [GBytes] 1.0E-09*(SBOX0C0+SBOX1C0)*64 +Bandwidth QPI to HA or IIO [MBytes/s] 1.0E-06*(((SBOX0C1+SBOX0C2+SBOX1C1+SBOX1C2)*8)-((SBOX0C0+SBOX1C0)*64))/time +Data volume QPI to HA or IIO [GBytes] 1.0E-09*(((SBOX0C1+SBOX0C2+SBOX1C1+SBOX1C2)*8)-((SBOX0C0+SBOX1C0)*64)) + +LONG +Formulas: +Received bandwidth from QPI [MBytes/s] = 1.0E-06*(sum(RXL_FLITS_G1_DRS_DATA)+sum(RXL_FLITS_G2_NCB_DATA))*8/time +Received data volume from QPI [GBytes] = 1.0E-09*(sum(RXL_FLITS_G1_DRS_DATA)+sum(RXL_FLITS_G2_NCB_DATA))*8 +Bandwidth QPI to LLC [MBytes/s] = 1.0E-06*(sum(DIRECT2CORE_SUCCESS))*64/time +Data volume QPI to LLC [GBytes] = 1.0E-09*(sum(DIRECT2CORE_SUCCESS))*64 +Bandwidth QPI to HA or IIO [MBytes/s] = 1.0E-06*(((sum(RXL_FLITS_G1_DRS_DATA)+sum(RXL_FLITS_G2_NCB_DATA))*8)-((sum(DIRECT2CORE_SUCCESS))*64))/time +Data volume QPI to HA or IIO [GBytes] = 1.0E-09*(((sum(RXL_FLITS_G1_DRS_DATA)+sum(RXL_FLITS_G2_NCB_DATA))*8)-((sum(DIRECT2CORE_SUCCESS))*64)) +- +Profiling group to measure traffic on the QPI. diff --git a/collectors/likwid/groups/sandybridgeEP/RECOVERY.txt b/collectors/likwid/groups/sandybridgeEP/RECOVERY.txt new file mode 100644 index 0000000..7928ee4 --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/RECOVERY.txt @@ -0,0 +1,22 @@ +SHORT Recovery duration + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INT_MISC_RECOVERY_CYCLES +PMC1 INT_MISC_RECOVERY_COUNT + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Avg. recovery duration PMC0/PMC1 + +LONG +Formulas: +Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT +- +This group measures the duration of recoveries after SSE exception, memory +disambiguation, etc... 
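A note on the QPI group above: the RXL_FLITS_* events count 8-byte flits while DIRECT2CORE_SUCCESS counts full 64-byte cache lines, so the two are scaled differently before being combined, and the HA/IIO share is what remains after subtracting the Direct2Core bytes. A minimal Go sketch with invented counter values (each input stands for the sum over both SBOXes):

package main

import "fmt"

func main() {
	flits := 4.0e9 // RXL_FLITS_G1_DRS_DATA + RXL_FLITS_G2_NCB_DATA, 8 bytes each (assumed value)
	d2c := 3.0e8   // DIRECT2CORE_SUCCESS, 64 bytes each (assumed value)
	seconds := 10.0

	recvBW := 1.0e-06 * flits * 8 / seconds           // received bandwidth from QPI [MBytes/s]
	llcBW := 1.0e-06 * d2c * 64 / seconds             // bandwidth QPI to LLC [MBytes/s]
	haIioBW := 1.0e-06 * (flits*8 - d2c*64) / seconds // remainder: bandwidth QPI to HA or IIO
	fmt.Printf("QPI recv = %.1f, to LLC = %.1f, to HA/IIO = %.1f MBytes/s\n", recvBW, llcBW, haIioBW)
}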
diff --git a/collectors/likwid/groups/sandybridgeEP/TLB_DATA.txt b/collectors/likwid/groups/sandybridgeEP/TLB_DATA.txt
new file mode 100644
index 0000000..8d94e05
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2 DTLB_LOAD_MISSES_WALK_DURATION
+PMC3 DTLB_STORE_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 DTLB load misses PMC0
+L1 DTLB load miss rate PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses PMC1
+L1 DTLB store miss rate PMC1/FIXC0
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The duration measures how many cycles a page table walk took on average.
+
diff --git a/collectors/likwid/groups/sandybridgeEP/TLB_INSTR.txt b/collectors/likwid/groups/sandybridgeEP/TLB_INSTR.txt
new file mode 100644
index 0000000..235d977
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 ITLB_MISSES_CAUSES_A_WALK
+PMC1 ITLB_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 ITLB misses PMC0
+L1 ITLB miss rate PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how many cycles a page table walk took on average.
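The TLB groups combine two normalizations: walk counts per retired instruction (miss rate) and walk cycles per walk (average duration). A minimal Go sketch with invented counter values:

package main

import "fmt"

func main() {
	instructions := 1.0e10 // INSTR_RETIRED_ANY (assumed value)
	loadWalks := 2.0e6     // DTLB_LOAD_MISSES_CAUSES_A_WALK (assumed value)
	walkCycles := 6.0e7    // DTLB_LOAD_MISSES_WALK_DURATION (assumed value)

	missRate := loadWalks / instructions // misses per retired instruction
	avgWalk := walkCycles / loadWalks    // average cycles per page table walk
	fmt.Printf("L1 DTLB load miss rate = %.2e, miss duration = %.1f cyc\n", missRate, avgWalk)
}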
+ diff --git a/collectors/likwid/groups/sandybridgeEP/TMA.txt b/collectors/likwid/groups/sandybridgeEP/TMA.txt new file mode 100644 index 0000000..afb4126 --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/TMA.txt @@ -0,0 +1,48 @@ +SHORT Top down cycle allocation + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_RETIRED_RETIRE_SLOTS +PMC2 IDQ_UOPS_NOT_DELIVERED_CORE +PMC3 INT_MISC_RECOVERY_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +IPC FIXC0/FIXC1 +Total Slots 4*FIXC1 +Slots Retired PMC1 +Fetch Bubbles PMC2 +Recovery Bubbles 4*PMC3 +Front End [%] PMC2/(4*FIXC1)*100 +Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100 +Retiring [%] PMC1/(4*FIXC1)*100 +Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100 + +LONG +Formulas: +Total Slots = 4*CPU_CLK_UNHALTED_CORE +Slots Retired = UOPS_RETIRED_RETIRE_SLOTS +Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE +Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES +Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100 +Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100 +Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100 +Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100 +-- +This performance group measures cycles to determine percentage of time spent in +front end, back end, retiring and speculation. These metrics are published and +verified by Intel. Further information: +Webpage describing Top-Down Method and its usage in Intel vTune: +https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method +Paper by Yasin Ahmad: +https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0 +Slides by Yasin Ahmad: +http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf +The performance group was originally published here: +http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/ diff --git a/collectors/likwid/groups/sandybridgeEP/UOPS.txt b/collectors/likwid/groups/sandybridgeEP/UOPS.txt new file mode 100644 index 0000000..a4d35d8 --- /dev/null +++ b/collectors/likwid/groups/sandybridgeEP/UOPS.txt @@ -0,0 +1,32 @@ +SHORT UOPs execution info + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_EXECUTED_THREAD +PMC2 UOPS_RETIRED_ALL + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Issued UOPs PMC0 +Executed UOPs PMC1 +Retired UOPs PMC2 + +LONG +Formulas: +Issued UOPs = UOPS_ISSUED_ANY +Executed UOPs = UOPS_EXECUTED_THREAD +Retired UOPs = UOPS_RETIRED_ALL +- +This group returns information about the instruction pipeline. It measures the +issued, executed and retired uOPs and returns the number of uOPs which were issued +but not executed as well as the number of uOPs which were executed but never retired. +The executed but not retired uOPs commonly come from speculatively executed branches. 
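A note on the TMA group above: the slot accounting assumes the core can issue four uops per cycle, so the budget is 4*CPU_CLK_UNHALTED_CORE slots and the four categories are shares of that budget, summing to 100% by construction. A minimal Go sketch with invented counter values:

package main

import "fmt"

func main() {
	clk := 1.0e9      // CPU_CLK_UNHALTED_CORE (assumed value)
	issued := 2.8e9   // UOPS_ISSUED_ANY (assumed value)
	retired := 2.6e9  // UOPS_RETIRED_RETIRE_SLOTS (assumed value)
	fetchBub := 6.0e8 // IDQ_UOPS_NOT_DELIVERED_CORE (assumed value)
	recovery := 2.0e7 // INT_MISC_RECOVERY_CYCLES (assumed value)

	slots := 4 * clk
	frontEnd := fetchBub / slots * 100
	speculation := (issued - retired + 4*recovery) / slots * 100
	retiring := retired / slots * 100
	backEnd := (1 - (fetchBub+issued+4*recovery)/slots) * 100
	fmt.Printf("Front End %.1f%%, Speculation %.1f%%, Retiring %.1f%%, Back End %.1f%%\n",
		frontEnd, speculation, retiring, backEnd)
}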
+
diff --git a/collectors/likwid/groups/sandybridgeEP/UOPS_EXEC.txt b/collectors/likwid/groups/sandybridgeEP/UOPS_EXEC.txt
new file mode 100644
index 0000000..7042df7
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/UOPS_EXEC.txt
@@ -0,0 +1,31 @@
+SHORT UOPs execution
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_EXECUTED_USED_CYCLES
+PMC1 UOPS_EXECUTED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the execution stage in the pipeline. Used cycles are all cycles where uOPs are
+executed while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
diff --git a/collectors/likwid/groups/sandybridgeEP/UOPS_ISSUE.txt b/collectors/likwid/groups/sandybridgeEP/UOPS_ISSUE.txt
new file mode 100644
index 0000000..9aac923
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/UOPS_ISSUE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs issuing
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_ISSUED_USED_CYCLES
+PMC1 UOPS_ISSUED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the issue stage in the pipeline. Used cycles are all cycles where uOPs are
+issued while unused cycles refer to pipeline stalls. Moreover, the group
+calculates the average stall duration in cycles.
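The :EDGEDETECT modifier in the UOPS_* groups turns a cycle count into an episode count: the plain event counts how many cycles were stalled, while the edge-detected variant counts only the 0-to-1 transitions, i.e. how many distinct stall phases occurred. Dividing the two gives the average stall length. A minimal Go sketch with invented counter values:

package main

import "fmt"

func main() {
	stallCycles := 3.0e8   // UOPS_EXECUTED_STALL_CYCLES (assumed value)
	stallEpisodes := 1.5e7 // UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT (assumed value)

	avgStall := stallCycles / stallEpisodes
	fmt.Printf("Avg stall duration = %.1f cycles over %.0f stall episodes\n", avgStall, stallEpisodes)
}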
diff --git a/collectors/likwid/groups/sandybridgeEP/UOPS_RETIRE.txt b/collectors/likwid/groups/sandybridgeEP/UOPS_RETIRE.txt
new file mode 100644
index 0000000..0f37585
--- /dev/null
+++ b/collectors/likwid/groups/sandybridgeEP/UOPS_RETIRE.txt
@@ -0,0 +1,31 @@
+SHORT UOPs retirement
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 UOPS_RETIRED_USED_CYCLES
+PMC1 UOPS_RETIRED_STALL_CYCLES
+PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Used cycles ratio [%] 100*PMC0/PMC2
+Unused cycles ratio [%] 100*PMC1/PMC2
+Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT
+
+
+LONG
+Formulas:
+Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES
+Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT
+-
+This performance group returns the ratios of used and unused cycles regarding
+the retirement stage in the pipeline (re-order buffer). Used cycles are all
+cycles where uOPs are retired while unused cycles refer to pipeline stalls.
+Moreover, the group calculates the average stall duration in cycles.
diff --git a/collectors/likwid/groups/silvermont/BRANCH.txt b/collectors/likwid/groups/silvermont/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/collectors/likwid/groups/silvermont/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 BR_INST_RETIRED_ALL_BRANCHES
+PMC1 BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Branch rate PMC0/FIXC0
+Branch misprediction rate PMC1/FIXC0
+Branch misprediction ratio PMC1/PMC0
+Instructions per branch FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often, on average, a branch or a mispredicted branch occurred
+per retired instruction. The branch misprediction ratio directly states what
+fraction of all branch instructions was mispredicted.
+Instructions per branch is 1/branch rate.
+
diff --git a/collectors/likwid/groups/silvermont/CLOCK.txt b/collectors/likwid/groups/silvermont/CLOCK.txt
new file mode 100644
index 0000000..b2174c8
--- /dev/null
+++ b/collectors/likwid/groups/silvermont/CLOCK.txt
@@ -0,0 +1,23 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Energy [J] PWR0
+Power [W] PWR0/time
+
+LONG
+Formulas:
+Power = PWR_PKG_ENERGY / time
+-
+Silvermont implements the new RAPL interface. This interface makes it
+possible to monitor the consumed energy at the package (socket) level.
+ diff --git a/collectors/likwid/groups/silvermont/DATA.txt b/collectors/likwid/groups/silvermont/DATA.txt new file mode 100644 index 0000000..61a915b --- /dev/null +++ b/collectors/likwid/groups/silvermont/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_UOPS_RETIRED_ALL_LOADS +PMC1 MEM_UOPS_RETIRED_ALL_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_UOPS_RETIRED_ALL_LOADS/MEM_UOPS_RETIRED_ALL_STORES +- +This is a metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/silvermont/DIVIDE.txt b/collectors/likwid/groups/silvermont/DIVIDE.txt new file mode 100644 index 0000000..f82fc59 --- /dev/null +++ b/collectors/likwid/groups/silvermont/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLES_DIV_BUSY_ANY +PMC1:EDGEDETECT CYCLES_DIV_BUSY_ANY + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC1:EDGEDETECT +Avg. divide unit usage duration PMC0/PMC1:EDGEDETECT + +LONG +Formulas: +Number of divide ops = CYCLES_DIV_BUSY_ANY:EDGEDETECT +Avg. divide unit usage duration = CYCLES_DIV_BUSY_ANY/CYCLES_DIV_BUSY_ANY:EDGEDETECT +- +This performance group measures the average latency of divide operations diff --git a/collectors/likwid/groups/silvermont/ENERGY.txt b/collectors/likwid/groups/silvermont/ENERGY.txt new file mode 100644 index 0000000..73939a3 --- /dev/null +++ b/collectors/likwid/groups/silvermont/ENERGY.txt @@ -0,0 +1,29 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +- +Silvermont implements the new RAPL interface. This interface enables to +monitor the consumed energy on the package (socket) level. + diff --git a/collectors/likwid/groups/silvermont/ICACHE.txt b/collectors/likwid/groups/silvermont/ICACHE.txt new file mode 100644 index 0000000..5f11ad6 --- /dev/null +++ b/collectors/likwid/groups/silvermont/ICACHE.txt @@ -0,0 +1,25 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ICACHE_ACCESSES +PMC1 ICACHE_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 + +LONG +Formulas: +L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY +L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES +- +This group measures some L1 instruction cache metrics. 
diff --git a/collectors/likwid/groups/silvermont/L2CACHE.txt b/collectors/likwid/groups/silvermont/L2CACHE.txt
new file mode 100644
index 0000000..32a1545
--- /dev/null
+++ b/collectors/likwid/groups/silvermont/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 LONGEST_LAT_CACHE_REFERENCE
+PMC1 LONGEST_LAT_CACHE_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = LONGEST_LAT_CACHE_REFERENCE/INSTR_RETIRED_ANY
+L2 miss rate = LONGEST_LAT_CACHE_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = LONGEST_LAT_CACHE_MISS/LONGEST_LAT_CACHE_REFERENCE
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. The L2 request rate tells you how data-intensive your code is,
+i.e. how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure of how often it was necessary to get
+cache lines from memory. Finally, the L2 miss ratio tells you how many of your
+memory references required a cache line to be loaded from a higher level.
+While the data cache miss rate might be dictated by your algorithm, you should
+try to get the data cache miss ratio as low as possible by increasing your
+cache reuse.
+
diff --git a/collectors/likwid/groups/silvermont/MEM.txt b/collectors/likwid/groups/silvermont/MEM.txt
new file mode 100644
index 0000000..de78337
--- /dev/null
+++ b/collectors/likwid/groups/silvermont/MEM.txt
@@ -0,0 +1,37 @@
+SHORT Memory load bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 LONGEST_LAT_CACHE_MISS
+PMC1 OFFCORE_RESPONSE_1_WB_ANY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(PMC0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(PMC0)*64.0
+Memory writeback bandwidth [MBytes/s] 1.0E-06*(PMC1)*64.0/time
+Memory writeback data volume [GBytes] 1.0E-09*(PMC1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(LONGEST_LAT_CACHE_MISS)*64/time
+Memory read data volume [GBytes] = 1.0E-09*(LONGEST_LAT_CACHE_MISS)*64
+Memory writeback bandwidth [MBytes/s] = 1.0E-06*(OFFCORE_RESPONSE_1_WB_ANY)*64/time
+Memory writeback data volume [GBytes] = 1.0E-09*(OFFCORE_RESPONSE_1_WB_ANY)*64
+Memory bandwidth [MBytes/s] = 1.0E-06*(LONGEST_LAT_CACHE_MISS+OFFCORE_RESPONSE_1_WB_ANY)*64/time
+Memory data volume [GBytes] = 1.0E-09*(LONGEST_LAT_CACHE_MISS+OFFCORE_RESPONSE_1_WB_ANY)*64
+-
+Profiling group to measure the L2 to memory load bandwidth. The bandwidth is computed from
+the number of cache lines allocated in the L2 cache. Since there is no way to retrieve
+the evicted cache lines, this group measures only the load bandwidth. The
+writeback metrics count only modified cache lines that are written back in order
+to go to exclusive state.
+The group also outputs the total load and writeback data volume transferred between memory and L2.
+
diff --git a/collectors/likwid/groups/silvermont/TLB_DATA.txt b/collectors/likwid/groups/silvermont/TLB_DATA.txt
new file mode 100644
index 0000000..5f2617f
--- /dev/null
+++ b/collectors/likwid/groups/silvermont/TLB_DATA.txt
@@ -0,0 +1,27 @@
+SHORT L2 data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 PAGE_WALKS_DTLB_COUNT
+PMC1 PAGE_WALKS_DTLB_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 DTLB misses PMC0
+L1 DTLB miss rate PMC0/FIXC0
+L1 DTLB miss duration [Cyc] PMC1/PMC0
+
+LONG
+Formulas:
+L1 DTLB misses = PAGE_WALKS_DTLB_COUNT
+L1 DTLB miss rate = PAGE_WALKS_DTLB_COUNT / INSTR_RETIRED_ANY
+L1 DTLB miss duration [Cyc] = PAGE_WALKS_DTLB_CYCLES / PAGE_WALKS_DTLB_COUNT
+-
+The DTLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how many cycles a page table walk took on average.
+
diff --git a/collectors/likwid/groups/silvermont/TLB_INSTR.txt b/collectors/likwid/groups/silvermont/TLB_INSTR.txt
new file mode 100644
index 0000000..f3dd3ec
--- /dev/null
+++ b/collectors/likwid/groups/silvermont/TLB_INSTR.txt
@@ -0,0 +1,27 @@
+SHORT L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 PAGE_WALKS_ITLB_COUNT
+PMC1 PAGE_WALKS_ITLB_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+L1 ITLB misses PMC0
+L1 ITLB miss rate PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = PAGE_WALKS_ITLB_COUNT
+L1 ITLB miss rate = PAGE_WALKS_ITLB_COUNT / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = PAGE_WALKS_ITLB_CYCLES / PAGE_WALKS_ITLB_COUNT
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how many cycles a page table walk took on average.
diff --git a/collectors/likwid/groups/skylake/BRANCH.txt b/collectors/likwid/groups/skylake/BRANCH.txt
new file mode 100644
index 0000000..b8d41b2
--- /dev/null
+++ b/collectors/likwid/groups/skylake/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 BR_INST_RETIRED_ALL_BRANCHES
+PMC1 BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+Branch rate PMC0/FIXC0
+Branch misprediction rate PMC1/FIXC0
+Branch misprediction ratio PMC1/PMC0
+Instructions per branch FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often, on average, a branch or a mispredicted branch occurred
+per retired instruction. The branch misprediction ratio directly states what
+fraction of all branch instructions was mispredicted.
+Instructions per branch is 1/branch rate.
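The BRANCH groups normalize the same two raw counts three ways: by all retired instructions (rates), by retired branches (ratio), and as the reciprocal of the branch rate (instructions per branch). A minimal Go sketch with invented counter values:

package main

import "fmt"

func main() {
	instructions := 1.0e10 // INSTR_RETIRED_ANY (assumed value)
	branches := 2.0e9      // BR_INST_RETIRED_ALL_BRANCHES (assumed value)
	mispredicted := 4.0e7  // BR_MISP_RETIRED_ALL_BRANCHES (assumed value)

	fmt.Printf("Branch rate = %.3f\n", branches/instructions)
	fmt.Printf("Branch misprediction rate = %.2e\n", mispredicted/instructions)
	fmt.Printf("Branch misprediction ratio = %.3f\n", mispredicted/branches)
	fmt.Printf("Instructions per branch = %.1f\n", instructions/branches)
}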
+ diff --git a/collectors/likwid/groups/skylake/CLOCK.txt b/collectors/likwid/groups/skylake/CLOCK.txt new file mode 100644 index 0000000..d682e3a --- /dev/null +++ b/collectors/likwid/groups/skylake/CLOCK.txt @@ -0,0 +1,30 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +UBOXFIX UNCORE_CLOCK + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*UBOXFIX/time +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time +- +Skylake implements the RAPL interface. This interface enables to +monitor the consumed energy on the package (socket) and DRAM level. + diff --git a/collectors/likwid/groups/skylake/CYCLE_ACTIVITY.txt b/collectors/likwid/groups/skylake/CYCLE_ACTIVITY.txt new file mode 100644 index 0000000..c432a44 --- /dev/null +++ b/collectors/likwid/groups/skylake/CYCLE_ACTIVITY.txt @@ -0,0 +1,38 @@ +SHORT Cycle Activities + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING +PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING +PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING +PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Cycles without execution [%] (PMC3/FIXC1)*100 +Cycles without execution due to L1D [%] (PMC2/FIXC1)*100 +Cycles without execution due to L2 [%] (PMC0/FIXC1)*100 +Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100 +Cycles with stalls due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles with stalls due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100 +-- +This performance group measures the cycles while waiting for data from the cache +and memory hierarchy. +CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts number of cycles nothing is executed on +any execution port. +CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while memory subsystem has an +outstanding load. 
diff --git a/collectors/likwid/groups/skylake/CYCLE_STALLS.txt b/collectors/likwid/groups/skylake/CYCLE_STALLS.txt new file mode 100644 index 0000000..795aeb9 --- /dev/null +++ b/collectors/likwid/groups/skylake/CYCLE_STALLS.txt @@ -0,0 +1,45 @@ +SHORT Cycle Activities (Stalls) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING +PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING +PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING +PMC3 CYCLE_ACTIVITY_STALLS_TOTAL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Total execution stalls PMC3 +Stalls caused by L1D misses [%] (PMC2/PMC3)*100 +Stalls caused by L2 misses [%] (PMC0/PMC3)*100 +Stalls caused by memory loads [%] (PMC1/PMC3)*100 +Execution stall rate [%] (PMC3/FIXC1)*100 +Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100 +Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100 +Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL +Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100 +-- +This performance group measures the stalls caused by data traffic in the cache +hierarchy. +CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls. +CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while memory subsystem has +an outstanding load. diff --git a/collectors/likwid/groups/skylake/DATA.txt b/collectors/likwid/groups/skylake/DATA.txt new file mode 100644 index 0000000..4e6e938 --- /dev/null +++ b/collectors/likwid/groups/skylake/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_INST_RETIRED_ALL_LOADS +PMC1 MEM_INST_RETIRED_ALL_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES +- +This is a metric to determine your load to store ratio. 
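A note on the CYCLE_STALLS group above: it reports each stall reason twice, once as a share of total stalls (the "caused by" percentages, showing how the stall budget splits up) and once as a share of all core cycles (the "rate" percentages, showing how much of the runtime stalls at all). A minimal Go sketch with invented counter values:

package main

import "fmt"

func main() {
	clk := 1.0e9         // CPU_CLK_UNHALTED_CORE (assumed value)
	stallsTotal := 4.0e8 // CYCLE_ACTIVITY_STALLS_TOTAL (assumed value)
	stallsL2 := 1.0e8    // CYCLE_ACTIVITY_STALLS_L2_PENDING (assumed value)

	shareOfStalls := stallsL2 / stallsTotal * 100 // Stalls caused by L2 misses [%]
	shareOfCycles := stallsL2 / clk * 100         // Stalls caused by L2 misses rate [%]
	fmt.Printf("L2-pending stalls: %.1f%% of all stalls, %.1f%% of all cycles\n",
		shareOfStalls, shareOfCycles)
}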
+ diff --git a/collectors/likwid/groups/skylake/DIVIDE.txt b/collectors/likwid/groups/skylake/DIVIDE.txt new file mode 100644 index 0000000..40b4ab6 --- /dev/null +++ b/collectors/likwid/groups/skylake/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ARITH_DIVIDER_COUNT +PMC1 ARITH_DIVIDER_ACTIVE + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_DIVIDER_COUNT +Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT +- +This performance group measures the average latency of divide operations diff --git a/collectors/likwid/groups/skylake/ENERGY.txt b/collectors/likwid/groups/skylake/ENERGY.txt new file mode 100644 index 0000000..07dbda5 --- /dev/null +++ b/collectors/likwid/groups/skylake/ENERGY.txt @@ -0,0 +1,39 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR2 PWR_PP1_ENERGY +PWR3 PWR_DRAM_ENERGY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy PP1 [J] PWR2 +Power PP1 [W] PWR2/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power PP1 = PWR_PP1_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +Skylake implements the RAPL interface. This interface enables to +monitor the consumed energy on the package (socket) and DRAM level. + diff --git a/collectors/likwid/groups/skylake/FALSE_SHARE.txt b/collectors/likwid/groups/skylake/FALSE_SHARE.txt new file mode 100644 index 0000000..65ff4d4 --- /dev/null +++ b/collectors/likwid/groups/skylake/FALSE_SHARE.txt @@ -0,0 +1,25 @@ +SHORT False sharing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_L3_HIT_RETIRED_XSNP_HITM +PMC2 MEM_INST_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local LLC false sharing [MByte] 1.E-06*PMC0*64 +Local LLC false sharing rate PMC0/PMC2 + +LONG +Formulas: +Local LLC false sharing [MByte] = 1.E-06*MEM_LOAD_L3_HIT_RETIRED_XSNP_HITM*64 +Local LLC false sharing rate = MEM_LOAD_L3_HIT_RETIRED_XSNP_HITM/MEM_INST_RETIRED_ALL +- +False-sharing of cache lines can dramatically reduce the performance of an +application. This performance group measures the L3 traffic induced by false-sharing. +The false-sharing rate uses all memory loads as reference. 
diff --git a/collectors/likwid/groups/skylake/FLOPS_AVX.txt b/collectors/likwid/groups/skylake/FLOPS_AVX.txt new file mode 100644 index 0000000..ebde747 --- /dev/null +++ b/collectors/likwid/groups/skylake/FLOPS_AVX.txt @@ -0,0 +1,24 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +- +Packed 256-bit (32-byte) AVX FLOP rates. + diff --git a/collectors/likwid/groups/skylake/FLOPS_DP.txt b/collectors/likwid/groups/skylake/FLOPS_DP.txt new file mode 100644 index 0000000..ff7a833 --- /dev/null +++ b/collectors/likwid/groups/skylake/FLOPS_DP.txt @@ -0,0 +1,31 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE) +- +SSE scalar and packed double precision FLOP rates.
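To make the FLOPS_DP formulas concrete: each retired 128-bit packed double instruction contributes 2 FLOPs and each 256-bit packed instruction 4, so the derived metrics are plain arithmetic over the counter deltas. A hedged Go sketch with made-up counter values (all variable names are illustrative):

package main

import "fmt"

// Sketch of evaluating the FLOPS_DP metrics from raw counter deltas,
// following the formulas in the group file above.
func main() {
	var (
		pmc0 = 1.0e9 // FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
		pmc1 = 2.0e9 // FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
		pmc2 = 5.0e8 // FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
		time = 2.0   // measurement duration in seconds
	)
	// Each 128-bit packed op does 2 DP FLOPs, each 256-bit packed op does 4.
	mflops := 1.0e-06 * (pmc0*2.0 + pmc1 + pmc2*4.0) / time
	vecRatio := 100 * (pmc0 + pmc2) / (pmc0 + pmc1 + pmc2)
	fmt.Printf("DP [MFLOP/s]: %.1f, vectorization ratio [%%]: %.1f\n", mflops, vecRatio)
}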
+ diff --git a/collectors/likwid/groups/skylake/FLOPS_SP.txt b/collectors/likwid/groups/skylake/FLOPS_SP.txt new file mode 100644 index 0000000..3a7d56b --- /dev/null +++ b/collectors/likwid/groups/skylake/FLOPS_SP.txt @@ -0,0 +1,31 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE) +- +SSE scalar and packed single precision FLOP rates. + diff --git a/collectors/likwid/groups/skylake/ICACHE.txt b/collectors/likwid/groups/skylake/ICACHE.txt new file mode 100644 index 0000000..aab7dac --- /dev/null +++ b/collectors/likwid/groups/skylake/ICACHE.txt @@ -0,0 +1,30 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ICACHE_64B_IFTAG_ALL +PMC1 ICACHE_64B_IFTAG_MISS +PMC2 ICACHE_64B_IFTAG_STALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 +L1I stalls PMC2 +L1I stall rate PMC2/FIXC0 + +LONG +Formulas: +L1I request rate = ICACHE_64B_IFTAG_ALL / INSTR_RETIRED_ANY +L1I miss rate = ICACHE_64B_IFTAG_MISS / INSTR_RETIRED_ANY +L1I miss ratio = ICACHE_64B_IFTAG_MISS / ICACHE_64B_IFTAG_ALL +L1I stalls = ICACHE_64B_IFTAG_STALL +L1I stall rate = ICACHE_64B_IFTAG_STALL / INSTR_RETIRED_ANY +- +This group measures some L1 instruction cache metrics.
diff --git a/collectors/likwid/groups/skylake/L2.txt b/collectors/likwid/groups/skylake/L2.txt new file mode 100644 index 0000000..1a92a95 --- /dev/null +++ b/collectors/likwid/groups/skylake/L2.txt @@ -0,0 +1,38 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L1D_M_EVICT +PMC2 ICACHE_64B_IFTAG_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_64B_IFTAG_MISS)*64/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_64B_IFTAG_MISS)*64 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the +number of cache lines allocated in the L1 and the number of modified cache lines +evicted from the L1. The group also outputs the total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and traffic caused by misses in the +L1 instruction cache. + diff --git a/collectors/likwid/groups/skylake/L2CACHE.txt b/collectors/likwid/groups/skylake/L2CACHE.txt new file mode 100644 index 0000000..fbc3745 --- /dev/null +++ b/collectors/likwid/groups/skylake/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_TRANS_ALL_REQUESTS +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS +- +This group measures the locality of your data accesses with regard to the +L2 cache. The L2 request rate tells you how data intensive your code is, +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally, the L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+ + diff --git a/collectors/likwid/groups/skylake/L3.txt b/collectors/likwid/groups/skylake/L3.txt new file mode 100644 index 0000000..f63a918 --- /dev/null +++ b/collectors/likwid/groups/skylake/L3.txt @@ -0,0 +1,36 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ALL +PMC1 L2_TRANS_L2_WB + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed from the +number of cache lines allocated in the L2 and the number of modified cache lines +evicted from the L2. This group also outputs the data volume transferred between the +L3 and the measured cores' L2 caches. Note that this bandwidth also includes data +transfers due to a write allocate load on a store miss in L2. + diff --git a/collectors/likwid/groups/skylake/L3CACHE.txt b/collectors/likwid/groups/skylake/L3CACHE.txt new file mode 100644 index 0000000..94953ef --- /dev/null +++ b/collectors/likwid/groups/skylake/L3CACHE.txt @@ -0,0 +1,35 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_RETIRED_L3_HIT +PMC1 MEM_LOAD_RETIRED_L3_MISS +PMC2 UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate (PMC0+PMC1)/PMC2 +L3 miss rate PMC1/PMC2 +L3 miss ratio PMC1/(PMC0+PMC1) + +LONG +Formulas: +L3 request rate = (MEM_LOAD_RETIRED_L3_HIT+MEM_LOAD_RETIRED_L3_MISS)/UOPS_RETIRED_ALL +L3 miss rate = MEM_LOAD_RETIRED_L3_MISS/UOPS_RETIRED_ALL +L3 miss ratio = MEM_LOAD_RETIRED_L3_MISS/(MEM_LOAD_RETIRED_L3_HIT+MEM_LOAD_RETIRED_L3_MISS) +- +This group measures the locality of your data accesses with regard to the +L3 cache. The L3 request rate tells you how data intensive your code is, +or how many data accesses you have on average per instruction. +The L3 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally, the L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+ + diff --git a/collectors/likwid/groups/skylake/MEM.txt b/collectors/likwid/groups/skylake/MEM.txt new file mode 100644 index 0000000..3a12df7 --- /dev/null +++ b/collectors/likwid/groups/skylake/MEM.txt @@ -0,0 +1,36 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +MBOX0C1 DRAM_READS +MBOX0C2 DRAM_WRITES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory load bandwidth [MBytes/s] 1.0E-06*MBOX0C1*64.0/time +Memory load data volume [GBytes] 1.0E-09*MBOX0C1*64.0 +Memory evict bandwidth [MBytes/s] 1.0E-06*MBOX0C2*64.0/time +Memory evict data volume [GBytes] 1.0E-09*MBOX0C2*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX0C2)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX0C2)*64.0 + +LONG +Formulas: +Memory load bandwidth [MBytes/s] = 1.0E-06*DRAM_READS*64.0/time +Memory load data volume [GBytes] = 1.0E-09*DRAM_READS*64.0 +Memory evict bandwidth [MBytes/s] = 1.0E-06*DRAM_WRITES*64.0/time +Memory evict data volume [GBytes] = 1.0E-09*DRAM_WRITES*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(DRAM_READS+DRAM_WRITES)*64.0/time +Memory data volume [GBytes] = 1.0E-09*(DRAM_READS+DRAM_WRITES)*64.0 +- +Profiling group to measure main memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on a +per-socket basis. The group also outputs the total data volume transferred +to and from main memory. + diff --git a/collectors/likwid/groups/skylake/MEM_DP.txt b/collectors/likwid/groups/skylake/MEM_DP.txt new file mode 100644 index 0000000..14a359a --- /dev/null +++ b/collectors/likwid/groups/skylake/MEM_DP.txt @@ -0,0 +1,59 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +MBOX0C1 DRAM_READS +MBOX0C2 DRAM_WRITES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory load bandwidth [MBytes/s] 1.0E-06*MBOX0C1*64.0/time +Memory load data volume [GBytes] 1.0E-09*MBOX0C1*64.0 +Memory evict bandwidth [MBytes/s] 1.0E-06*MBOX0C2*64.0/time +Memory evict data volume [GBytes] 1.0E-09*MBOX0C2*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX0C2)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX0C2)*64.0 +Operational intensity (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C1+MBOX0C2)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime +Packed [MUOPS/s] = 
1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*DRAM_READS*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*DRAM_READS*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*DRAM_WRITES*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*DRAM_WRITES*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(DRAM_READS+DRAM_WRITES)*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(DRAM_READS+DRAM_WRITES)*64.0 +Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/((DRAM_READS+DRAM_WRITES)*64.0) +-- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per-socket basis. Also outputs the total data volume transferred from main memory. +SSE scalar and packed double precision FLOP rates. Also reports on packed AVX +256-bit instructions. +The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. diff --git a/collectors/likwid/groups/skylake/MEM_SP.txt b/collectors/likwid/groups/skylake/MEM_SP.txt new file mode 100644 index 0000000..0b47052 --- /dev/null +++ b/collectors/likwid/groups/skylake/MEM_SP.txt @@ -0,0 +1,59 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +MBOX0C1 DRAM_READS +MBOX0C2 DRAM_WRITES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory load bandwidth [MBytes/s] 1.0E-06*MBOX0C1*64.0/time +Memory load data volume [GBytes] 1.0E-09*MBOX0C1*64.0 +Memory evict bandwidth [MBytes/s] 1.0E-06*MBOX0C2*64.0/time +Memory evict data volume [GBytes] 1.0E-09*MBOX0C2*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX0C2)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX0C2)*64.0 +Operational intensity (PMC0*4.0+PMC1+PMC2*8.0)/((MBOX0C1+MBOX0C2)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*DRAM_READS*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*DRAM_READS*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*DRAM_WRITES*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*DRAM_WRITES*64.0 +Memory bandwidth [MBytes/s] = 
1.0E-06*(DRAM_READS+DRAM_WRITES)*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(DRAM_READS+DRAM_WRITES)*64.0 +Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/((DRAM_READS+DRAM_WRITES)*64.0) +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per-socket basis. Also outputs the total data volume transferred from main memory. +SSE scalar and packed single precision FLOP rates. Also reports on packed AVX +256-bit instructions. +The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. diff --git a/collectors/likwid/groups/skylake/PORT_USAGE.txt b/collectors/likwid/groups/skylake/PORT_USAGE.txt new file mode 100644 index 0000000..eca8f2a --- /dev/null +++ b/collectors/likwid/groups/skylake/PORT_USAGE.txt @@ -0,0 +1,46 @@ +SHORT Execution port utilization + +REQUIRE_NOHT + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_DISPATCHED_PORT_PORT_0 +PMC1 UOPS_DISPATCHED_PORT_PORT_1 +PMC2 UOPS_DISPATCHED_PORT_PORT_2 +PMC3 UOPS_DISPATCHED_PORT_PORT_3 +PMC4 UOPS_DISPATCHED_PORT_PORT_4 +PMC5 UOPS_DISPATCHED_PORT_PORT_5 +PMC6 UOPS_DISPATCHED_PORT_PORT_6 +PMC7 UOPS_DISPATCHED_PORT_PORT_7 + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Port0 usage ratio PMC0/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port1 usage ratio PMC1/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port2 usage ratio PMC2/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port3 usage ratio PMC3/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port4 usage ratio PMC4/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port5 usage ratio PMC5/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port6 usage ratio PMC6/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) +Port7 usage ratio PMC7/(PMC0+PMC1+PMC2+PMC3+PMC4+PMC5+PMC6+PMC7) + +LONG +Formulas: +Port0 usage ratio = UOPS_DISPATCHED_PORT_PORT_0/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port1 usage ratio = UOPS_DISPATCHED_PORT_PORT_1/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port2 usage ratio = UOPS_DISPATCHED_PORT_PORT_2/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port3 usage ratio = UOPS_DISPATCHED_PORT_PORT_3/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port4 usage ratio = UOPS_DISPATCHED_PORT_PORT_4/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port5 usage ratio = UOPS_DISPATCHED_PORT_PORT_5/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port6 usage ratio = UOPS_DISPATCHED_PORT_PORT_6/SUM(UOPS_DISPATCHED_PORT_PORT_*) +Port7 usage ratio = UOPS_DISPATCHED_PORT_PORT_7/SUM(UOPS_DISPATCHED_PORT_PORT_*) +- +This group measures the execution port utilization in a CPU core. The group can +only be measured when HyperThreading is disabled because only then each CPU core +can program eight counters.
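The operational intensity in the MEM_DP/MEM_SP groups above divides the core-local FLOP count by the socket-wide DRAM traffic, counted as 64 bytes per DRAM access. A hedged Go sketch with made-up counter values (variable names are illustrative, not LIKWID identifiers):

package main

import "fmt"

// Illustrative evaluation of the MEM_DP operational intensity formula:
// FLOPs divided by DRAM traffic in bytes (64 bytes per DRAM access).
func main() {
	var (
		packed128  = 1.0e9 // FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
		scalar     = 4.0e9 // FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
		packed256  = 2.0e9 // FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
		dramReads  = 3.0e8 // DRAM_READS (socket-wide uncore counter)
		dramWrites = 1.0e8 // DRAM_WRITES
	)
	flops := packed128*2.0 + scalar + packed256*4.0
	bytes := (dramReads + dramWrites) * 64.0
	fmt.Printf("Operational intensity [FLOP/Byte]: %.2f\n", flops/bytes)
}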
diff --git a/collectors/likwid/groups/skylake/RECOVERY.txt b/collectors/likwid/groups/skylake/RECOVERY.txt new file mode 100644 index 0000000..7928ee4 --- /dev/null +++ b/collectors/likwid/groups/skylake/RECOVERY.txt @@ -0,0 +1,22 @@ +SHORT Recovery duration + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INT_MISC_RECOVERY_CYCLES +PMC1 INT_MISC_RECOVERY_COUNT + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Avg. recovery duration PMC0/PMC1 + +LONG +Formulas: +Avg. recovery duration = INT_MISC_RECOVERY_CYCLES/INT_MISC_RECOVERY_COUNT +- +This group measures the duration of recoveries after SSE exceptions, memory +disambiguation, etc. diff --git a/collectors/likwid/groups/skylake/TLB_DATA.txt b/collectors/likwid/groups/skylake/TLB_DATA.txt new file mode 100644 index 0000000..10ee5e1 --- /dev/null +++ b/collectors/likwid/groups/skylake/TLB_DATA.txt @@ -0,0 +1,35 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK +PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK +PMC2 DTLB_LOAD_MISSES_WALK_ACTIVE +PMC3 DTLB_STORE_MISSES_WALK_ACTIVE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB load misses PMC0 +L1 DTLB load miss rate PMC0/FIXC0 +L1 DTLB load miss duration [Cyc] PMC2/PMC0 +L1 DTLB store misses PMC1 +L1 DTLB store miss rate PMC1/FIXC0 +L1 DTLB store miss duration [Cyc] PMC3/PMC1 + +LONG +Formulas: +L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_ACTIVE / DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK +L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_ACTIVE / DTLB_STORE_MISSES_CAUSES_A_WALK +- +The DTLB load and store miss rates give a measure of how often a TLB miss occurred +per instruction. The duration measures how many cycles a page table walk took. + diff --git a/collectors/likwid/groups/skylake/TLB_INSTR.txt b/collectors/likwid/groups/skylake/TLB_INSTR.txt new file mode 100644 index 0000000..9bc65a7 --- /dev/null +++ b/collectors/likwid/groups/skylake/TLB_INSTR.txt @@ -0,0 +1,28 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ITLB_MISSES_CAUSES_A_WALK +PMC1 ITLB_MISSES_WALK_ACTIVE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + + +LONG +Formulas: +L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK +L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_ACTIVE / ITLB_MISSES_CAUSES_A_WALK +- +The ITLB miss rate gives a measure of how often a TLB miss occurred +per instruction. The duration measures how many cycles a page table walk took.
+ diff --git a/collectors/likwid/groups/skylake/TMA.txt b/collectors/likwid/groups/skylake/TMA.txt new file mode 100644 index 0000000..afb4126 --- /dev/null +++ b/collectors/likwid/groups/skylake/TMA.txt @@ -0,0 +1,48 @@ +SHORT Top down cycle allocation + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_RETIRED_RETIRE_SLOTS +PMC2 IDQ_UOPS_NOT_DELIVERED_CORE +PMC3 INT_MISC_RECOVERY_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +IPC FIXC0/FIXC1 +Total Slots 4*FIXC1 +Slots Retired PMC1 +Fetch Bubbles PMC2 +Recovery Bubbles 4*PMC3 +Front End [%] PMC2/(4*FIXC1)*100 +Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100 +Retiring [%] PMC1/(4*FIXC1)*100 +Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100 + +LONG +Formulas: +Total Slots = 4*CPU_CLK_UNHALTED_CORE +Slots Retired = UOPS_RETIRED_RETIRE_SLOTS +Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE +Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES +Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100 +Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100 +Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100 +Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100 +-- +This performance group measures cycles to determine the percentage of time spent in +front end, back end, retiring and speculation. These metrics are published and +verified by Intel. Further information: +Webpage describing the Top-Down Method and its usage in Intel vTune: +https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method +Paper by Ahmad Yasin: +https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0 +Slides by Ahmad Yasin: +http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf +The performance group was originally published here: +http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/ diff --git a/collectors/likwid/groups/skylake/UOPS.txt b/collectors/likwid/groups/skylake/UOPS.txt new file mode 100644 index 0000000..c0a86f2 --- /dev/null +++ b/collectors/likwid/groups/skylake/UOPS.txt @@ -0,0 +1,29 @@ +SHORT UOPs execution info + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_EXECUTED_THREAD +PMC2 UOPS_RETIRED_ALL + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Issued UOPs PMC0 +Executed UOPs PMC1 +Retired UOPs PMC2 + +LONG +Formulas: +Issued UOPs = UOPS_ISSUED_ANY +Executed UOPs = UOPS_EXECUTED_THREAD +Retired UOPs = UOPS_RETIRED_ALL +- +This group returns information about the instruction pipeline. It measures the +issued, executed and retired uOPs.
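The TMA group above distributes the machine's issue slots (four per core cycle) over the four top-down categories, and by construction the four percentages sum to 100. A Go sketch with made-up counter values, following the TMA formulas (variable names are illustrative):

package main

import "fmt"

// Sketch of the four top-down categories from the TMA formulas above.
func main() {
	var (
		uopsIssued   = 8.0e9 // UOPS_ISSUED_ANY
		uopsRetired  = 7.0e9 // UOPS_RETIRED_RETIRE_SLOTS
		fetchBubbles = 1.5e9 // IDQ_UOPS_NOT_DELIVERED_CORE
		recoveries   = 1.0e8 // INT_MISC_RECOVERY_CYCLES
		coreCycles   = 3.0e9 // CPU_CLK_UNHALTED_CORE
	)
	slots := 4 * coreCycles // 4 issue slots per core cycle
	frontEnd := fetchBubbles / slots * 100
	speculation := (uopsIssued - uopsRetired + 4*recoveries) / slots * 100
	retiring := uopsRetired / slots * 100
	backEnd := (1 - (fetchBubbles+uopsIssued+4*recoveries)/slots) * 100
	fmt.Printf("FE %.1f%% Spec %.1f%% Ret %.1f%% BE %.1f%% (sums to 100%%)\n",
		frontEnd, speculation, retiring, backEnd)
}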
diff --git a/collectors/likwid/groups/skylake/UOPS_EXEC.txt b/collectors/likwid/groups/skylake/UOPS_EXEC.txt new file mode 100644 index 0000000..7042df7 --- /dev/null +++ b/collectors/likwid/groups/skylake/UOPS_EXEC.txt @@ -0,0 +1,31 @@ +SHORT UOPs execution + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_USED_CYCLES +PMC1 UOPS_EXECUTED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the execution stage in the pipeline. Used cycles are all cycles where uOPs are +executed while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles. diff --git a/collectors/likwid/groups/skylake/UOPS_ISSUE.txt b/collectors/likwid/groups/skylake/UOPS_ISSUE.txt new file mode 100644 index 0000000..9aac923 --- /dev/null +++ b/collectors/likwid/groups/skylake/UOPS_ISSUE.txt @@ -0,0 +1,31 @@ +SHORT UOPs issuing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_USED_CYCLES +PMC1 UOPS_ISSUED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the issue stage in the pipeline. Used cycles are all cycles where uOPs are +issued while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles.
diff --git a/collectors/likwid/groups/skylake/UOPS_RETIRE.txt b/collectors/likwid/groups/skylake/UOPS_RETIRE.txt new file mode 100644 index 0000000..0f37585 --- /dev/null +++ b/collectors/likwid/groups/skylake/UOPS_RETIRE.txt @@ -0,0 +1,31 @@ +SHORT UOPs retirement + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_RETIRED_USED_CYCLES +PMC1 UOPS_RETIRED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLK_UNHALTED_CORE +Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLK_UNHALTED_CORE +Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the retirement stage in the pipeline (re-order buffer). Used cycles are all +cycles where uOPs are retired while unused cycles refer to pipeline stalls. +Moreover, the group calculates the average stall duration in cycles. diff --git a/collectors/likwid/groups/skylakeX/BRANCH.txt b/collectors/likwid/groups/skylakeX/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often, on average, a branch or a mispredicted branch occurred +per retired instruction. The branch misprediction ratio expresses directly what +fraction of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate.
+ diff --git a/collectors/likwid/groups/skylakeX/CACHES.txt b/collectors/likwid/groups/skylakeX/CACHES.txt new file mode 100644 index 0000000..c700dd4 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/CACHES.txt @@ -0,0 +1,143 @@ +SHORT Cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L1D_M_EVICT +PMC2 L2_LINES_IN_ALL +PMC3 L2_TRANS_L2_WB +CBOX0C1 LLC_VICTIMS_M_STATE +CBOX1C1 LLC_VICTIMS_M_STATE +CBOX2C1 LLC_VICTIMS_M_STATE +CBOX3C1 LLC_VICTIMS_M_STATE +CBOX4C1 LLC_VICTIMS_M_STATE +CBOX5C1 LLC_VICTIMS_M_STATE +CBOX6C1 LLC_VICTIMS_M_STATE +CBOX7C1 LLC_VICTIMS_M_STATE +CBOX8C1 LLC_VICTIMS_M_STATE +CBOX9C1 LLC_VICTIMS_M_STATE +CBOX10C1 LLC_VICTIMS_M_STATE +CBOX11C1 LLC_VICTIMS_M_STATE +CBOX12C1 LLC_VICTIMS_M_STATE +CBOX13C1 LLC_VICTIMS_M_STATE +CBOX14C1 LLC_VICTIMS_M_STATE +CBOX15C1 LLC_VICTIMS_M_STATE +CBOX16C1 LLC_VICTIMS_M_STATE +CBOX17C1 LLC_VICTIMS_M_STATE +CBOX18C1 LLC_VICTIMS_M_STATE +CBOX19C1 LLC_VICTIMS_M_STATE +CBOX20C1 LLC_VICTIMS_M_STATE +CBOX21C1 LLC_VICTIMS_M_STATE +CBOX22C1 LLC_VICTIMS_M_STATE +CBOX23C1 LLC_VICTIMS_M_STATE +CBOX24C1 LLC_VICTIMS_M_STATE +CBOX25C1 LLC_VICTIMS_M_STATE +CBOX26C1 LLC_VICTIMS_M_STATE +CBOX27C1 LLC_VICTIMS_M_STATE +CBOX0C0 LLC_LOOKUP_DATA_READ +CBOX1C0 LLC_LOOKUP_DATA_READ +CBOX2C0 LLC_LOOKUP_DATA_READ +CBOX3C0 LLC_LOOKUP_DATA_READ +CBOX4C0 LLC_LOOKUP_DATA_READ +CBOX5C0 LLC_LOOKUP_DATA_READ +CBOX6C0 LLC_LOOKUP_DATA_READ +CBOX7C0 LLC_LOOKUP_DATA_READ +CBOX8C0 LLC_LOOKUP_DATA_READ +CBOX9C0 LLC_LOOKUP_DATA_READ +CBOX10C0 LLC_LOOKUP_DATA_READ +CBOX11C0 LLC_LOOKUP_DATA_READ +CBOX12C0 LLC_LOOKUP_DATA_READ +CBOX13C0 LLC_LOOKUP_DATA_READ +CBOX14C0 LLC_LOOKUP_DATA_READ +CBOX15C0 LLC_LOOKUP_DATA_READ +CBOX16C0 LLC_LOOKUP_DATA_READ +CBOX17C0 LLC_LOOKUP_DATA_READ +CBOX18C0 LLC_LOOKUP_DATA_READ +CBOX19C0 LLC_LOOKUP_DATA_READ +CBOX20C0 LLC_LOOKUP_DATA_READ +CBOX21C0 LLC_LOOKUP_DATA_READ +CBOX22C0 LLC_LOOKUP_DATA_READ +CBOX23C0 LLC_LOOKUP_DATA_READ +CBOX24C0 LLC_LOOKUP_DATA_READ +CBOX25C0 LLC_LOOKUP_DATA_READ +CBOX26C0 LLC_LOOKUP_DATA_READ +CBOX27C0 LLC_LOOKUP_DATA_READ +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 +L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time +L3 to L2 load data volume [GBytes] 1.0E-09*PMC2*64.0 +L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time +L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0 +System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0)*64.0/time 
+System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0)*64.0 +L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64/time +L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64 +L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64.0/time +L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX22C0+CBOX23C0+CBOX24C0+CBOX25C0+CBOX26C0+CBOX27C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1+CBOX22C1+CBOX23C1+CBOX24C1+CBOX25C1+CBOX26C1+CBOX27C1)*64.0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 + +LONG +Formulas: +L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time +L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64 +L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time +L1 to L2 evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64 +L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time +L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64 +L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time +L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64 +L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time +L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64 +L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +System to L3 bandwidth 
[MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ))*64/time +System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ))*64 +L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M_STATE))*64/time +L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M_STATE))*64 +L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M_STATE))*64/time +L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M_STATE))*64 +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 +- +Group to measure cache transfers between L1 and memory. Please note that the +L3 to/from system metrics contain any traffic to the system (memory, +Intel QPI, etc.) but do not seem to capture it completely, because the memory read +bandwidth and the L3 to L2 bandwidth are commonly higher than the system to L3 bandwidth. + diff --git a/collectors/likwid/groups/skylakeX/CLOCK.txt b/collectors/likwid/groups/skylakeX/CLOCK.txt new file mode 100644 index 0000000..b81bee6 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/CLOCK.txt @@ -0,0 +1,26 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +UBOXFIX UNCORE_CLOCK + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*UBOXFIX/time +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time +- +Skylake X implements the RAPL interface. This interface enables monitoring of +the consumed energy on the package (socket) level.
+ diff --git a/collectors/likwid/groups/skylakeX/CYCLE_ACTIVITY.txt b/collectors/likwid/groups/skylakeX/CYCLE_ACTIVITY.txt new file mode 100644 index 0000000..c432a44 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/CYCLE_ACTIVITY.txt @@ -0,0 +1,38 @@ +SHORT Cycle Activities + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING +PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING +PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING +PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Cycles without execution [%] (PMC3/FIXC1)*100 +Cycles without execution due to L1D [%] (PMC2/FIXC1)*100 +Cycles without execution due to L2 [%] (PMC0/FIXC1)*100 +Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100 +Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100 +-- +This performance group measures the cycles while waiting for data from the cache +and memory hierarchy. +CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts number of cycles nothing is executed on +any execution port. +CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is +outstanding. +CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while memory subsystem has an +outstanding load.
diff --git a/collectors/likwid/groups/skylakeX/CYCLE_STALLS.txt b/collectors/likwid/groups/skylakeX/CYCLE_STALLS.txt new file mode 100644 index 0000000..795aeb9 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/CYCLE_STALLS.txt @@ -0,0 +1,45 @@ +SHORT Cycle Activities (Stalls) + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING +PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING +PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING +PMC3 CYCLE_ACTIVITY_STALLS_TOTAL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Total execution stalls PMC3 +Stalls caused by L1D misses [%] (PMC2/PMC3)*100 +Stalls caused by L2 misses [%] (PMC0/PMC3)*100 +Stalls caused by memory loads [%] (PMC1/PMC3)*100 +Execution stall rate [%] (PMC3/FIXC1)*100 +Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100 +Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100 +Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100 + +LONG +Formulas: +Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL +Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100 +Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100 +Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100 +-- +This performance group measures the stalls caused by data traffic in the cache +hierarchy. +CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls. +CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand +load is outstanding. +CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while memory subsystem has +an outstanding load. diff --git a/collectors/likwid/groups/skylakeX/DATA.txt b/collectors/likwid/groups/skylakeX/DATA.txt new file mode 100644 index 0000000..4e6e938 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_INST_RETIRED_ALL_LOADS +PMC1 MEM_INST_RETIRED_ALL_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_INST_RETIRED_ALL_LOADS/MEM_INST_RETIRED_ALL_STORES +- +This is a metric to determine your load to store ratio. 
+ diff --git a/collectors/likwid/groups/skylakeX/DIVIDE.txt b/collectors/likwid/groups/skylakeX/DIVIDE.txt new file mode 100644 index 0000000..40b4ab6 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ARITH_DIVIDER_COUNT +PMC1 ARITH_DIVIDER_ACTIVE + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_DIVIDER_COUNT +Avg. divide unit usage duration = ARITH_DIVIDER_ACTIVE/ARITH_DIVIDER_COUNT +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/skylakeX/ENERGY.txt b/collectors/likwid/groups/skylakeX/ENERGY.txt new file mode 100644 index 0000000..fe7829f --- /dev/null +++ b/collectors/likwid/groups/skylakeX/ENERGY.txt @@ -0,0 +1,35 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +TMP0 TEMP_CORE +PWR0 PWR_PKG_ENERGY +PWR1 PWR_PP0_ENERGY +PWR3 PWR_DRAM_ENERGY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Temperature [C] TMP0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy PP0 [J] PWR1 +Power PP0 [W] PWR1/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time + +LONG +Formulas: +Power = PWR_PKG_ENERGY / time +Power PP0 = PWR_PP0_ENERGY / time +Power DRAM = PWR_DRAM_ENERGY / time +- +Skylake X implements the RAPL interface. This interface enables monitoring of +the consumed energy on the package (socket) and DRAM level. + diff --git a/collectors/likwid/groups/skylakeX/FLOPS_AVX.txt b/collectors/likwid/groups/skylakeX/FLOPS_AVX.txt new file mode 100644 index 0000000..e44a913 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/FLOPS_AVX.txt @@ -0,0 +1,25 @@ +SHORT Packed AVX MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Packed SP [MFLOP/s] 1.0E-06*(PMC0*8.0+PMC2*16.0)/time +Packed DP [MFLOP/s] 1.0E-06*(PMC1*4.0+PMC3*8.0)/time + +LONG +Formulas: +Packed SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +Packed DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +- +Packed 256-bit and 512-bit AVX FLOP rates.
diff --git a/collectors/likwid/groups/skylakeX/FLOPS_DP.txt b/collectors/likwid/groups/skylakeX/FLOPS_DP.txt new file mode 100644 index 0000000..177cff2 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/FLOPS_DP.txt @@ -0,0 +1,34 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time +AVX512 DP [MFLOP/s] 1.0E-06*(PMC3*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +AVX512 DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Vectorization ratio = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE) +- +SSE scalar and packed double precision FLOP rates. 
+ diff --git a/collectors/likwid/groups/skylakeX/FLOPS_SP.txt b/collectors/likwid/groups/skylakeX/FLOPS_SP.txt new file mode 100644 index 0000000..01d98c2 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/FLOPS_SP.txt @@ -0,0 +1,34 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time +AVX512 SP [MFLOP/s] 1.0E-06*(PMC3*16.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Vectorization ratio 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +AVX512 SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE) +- +SSE scalar and packed single precision FLOP rates. + diff --git a/collectors/likwid/groups/skylakeX/L2.txt b/collectors/likwid/groups/skylakeX/L2.txt new file mode 100644 index 0000000..1a92a95 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/L2.txt @@ -0,0 +1,38 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPLACEMENT +PMC1 L1D_M_EVICT +PMC2 ICACHE_64B_IFTAG_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_64B_IFTAG_MISS)*64/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_64B_IFTAG_MISS)*64 +- +Profiling group to measure L2 cache bandwidth. 
The bandwidth is computed from the +number of cache lines allocated in the L1 and the number of modified cache lines +evicted from the L1. The group also outputs the total data volume transferred between +L2 and L1. Note that this bandwidth also includes data transfers due to a write +allocate load on a store miss in L1 and traffic caused by misses in the +L1 instruction cache. + diff --git a/collectors/likwid/groups/skylakeX/L2CACHE.txt b/collectors/likwid/groups/skylakeX/L2CACHE.txt new file mode 100644 index 0000000..9b5dd4b --- /dev/null +++ b/collectors/likwid/groups/skylakeX/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_TRANS_ALL_REQUESTS +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS +- +This group measures the locality of your data accesses with regard to the +L2 cache. The L2 request rate tells you how data intensive your code is, +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally, the L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/skylakeX/L3.txt b/collectors/likwid/groups/skylakeX/L3.txt new file mode 100644 index 0000000..219f932 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/L3.txt @@ -0,0 +1,48 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ALL +PMC1 L2_TRANS_L2_WB +PMC2 IDI_MISC_WB_DOWNGRADE +PMC3 IDI_MISC_WB_UPGRADE + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0 +L3|MEM evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3|MEM evict data volume [GBytes] 1.0E-09*PMC1*64.0 +Dropped CLs bandwidth [MBytes/s] 1.0E-6*PMC2*64.0/time +Dropped CLs data volume [GBytes] 1.0E-9*PMC2*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*IDI_MISC_WB_UPGRADE*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*IDI_MISC_WB_UPGRADE*64.0 +Dropped CLs bandwidth [MBytes/s] = 1.0E-6*IDI_MISC_WB_DOWNGRADE*64.0/time +Dropped CLs data volume [GBytes] = 1.0E-9*IDI_MISC_WB_DOWNGRADE*64.0 +L3|MEM evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time +L3|MEM evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time +L3 data volume 
[GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +-- +Profiling group to measure L3 cache bandwidth and data volume. For Intel Skylake +or Cascadelake, the L3 is a victim cache. This means that all data is loaded +from memory directly into the L2 cache (if the L3 prefetcher is inactive). Modified +data in L2 is evicted to L3 (the additional data transfer due to the non-inclusiveness of +L3 can be measured). Clean cache lines (only loaded data) might get dropped in +L2 to reduce traffic. If the amount of clean cache lines is smaller than the L3, they +might still be evicted to L3 based on a heuristic. diff --git a/collectors/likwid/groups/skylakeX/L3CACHE.txt b/collectors/likwid/groups/skylakeX/L3CACHE.txt new file mode 100644 index 0000000..bc664d1 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/L3CACHE.txt @@ -0,0 +1,35 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_LOAD_RETIRED_L3_HIT +PMC1 MEM_LOAD_RETIRED_L3_MISS +PMC2 UOPS_RETIRED_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate (PMC0+PMC1)/PMC2 +L3 miss rate PMC1/PMC2 +L3 miss ratio PMC1/(PMC0+PMC1) + +LONG +Formulas: +L3 request rate = (MEM_LOAD_RETIRED_L3_HIT+MEM_LOAD_RETIRED_L3_MISS)/UOPS_RETIRED_ALL +L3 miss rate = MEM_LOAD_RETIRED_L3_MISS/UOPS_RETIRED_ALL +L3 miss ratio = MEM_LOAD_RETIRED_L3_MISS/(MEM_LOAD_RETIRED_L3_HIT+MEM_LOAD_RETIRED_L3_MISS) +- +This group measures the locality of your data accesses with regard to the +L3 cache. L3 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L3 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally the L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse.
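For anyone consuming these group files programmatically, every derived metric is plain arithmetic over the raw counter values. A minimal Go sketch of the L3CACHE derivation above, using hypothetical counter values (the variable names mirror the events; none of this is part of the collector itself):

```go
package main

import "fmt"

func main() {
	// Hypothetical raw counts, as read from the PMCs after a measurement phase.
	l3Hit := 1.2e7  // MEM_LOAD_RETIRED_L3_HIT  (PMC0)
	l3Miss := 3.0e6 // MEM_LOAD_RETIRED_L3_MISS (PMC1)
	uops := 8.5e9   // UOPS_RETIRED_ALL         (PMC2)

	requestRate := (l3Hit + l3Miss) / uops // L3 accesses per retired uop
	missRate := l3Miss / uops              // L3 misses per retired uop
	missRatio := l3Miss / (l3Hit + l3Miss) // fraction of L3 accesses that miss

	fmt.Printf("L3 request rate: %.6f\n", requestRate)
	fmt.Printf("L3 miss rate:    %.6f\n", missRate)
	fmt.Printf("L3 miss ratio:   %.6f\n", missRatio)
}
```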
+ + diff --git a/collectors/likwid/groups/skylakeX/MEM.txt b/collectors/likwid/groups/skylakeX/MEM.txt new file mode 100644 index 0000000..3d50ecb --- /dev/null +++ b/collectors/likwid/groups/skylakeX/MEM.txt @@ -0,0 +1,48 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events, measurements are only possible on a +per-socket basis. Some of the counters may not be available on your system. +Also outputs the total data volume transferred from main memory. +The same metrics are provided by the HA group.
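The MEM group's formulas reduce to: sum the CAS counts over all memory channels, multiply by the 64-byte cache line size, and divide by the measurement time. A Go sketch with made-up per-channel counts (the slice order stands in for MBOX0..MBOX5; this is an illustration, not collector code):

```go
package main

import "fmt"

func main() {
	// Hypothetical per-channel CAS counts for one socket (MBOX0..MBOX5).
	casRd := []float64{1.1e8, 1.0e8, 9.8e7, 1.2e8, 1.0e8, 9.9e7} // CAS_COUNT_RD
	casWr := []float64{4.0e7, 3.8e7, 4.1e7, 3.9e7, 4.0e7, 3.7e7} // CAS_COUNT_WR
	seconds := 2.0                                               // measurement duration

	sum := func(xs []float64) (s float64) {
		for _, x := range xs {
			s += x
		}
		return
	}

	// Each CAS transfers one 64-byte cache line.
	readBW := 1.0e-06 * sum(casRd) * 64.0 / seconds  // MBytes/s
	writeBW := 1.0e-06 * sum(casWr) * 64.0 / seconds // MBytes/s
	fmt.Printf("read %.1f MB/s, write %.1f MB/s, total %.1f MB/s\n",
		readBW, writeBW, readBW+writeBW)
}
```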
+ diff --git a/collectors/likwid/groups/skylakeX/MEM_DP.txt b/collectors/likwid/groups/skylakeX/MEM_DP.txt new file mode 100644 index 0000000..d6e481a --- /dev/null +++ b/collectors/likwid/groups/skylakeX/MEM_DP.txt @@ -0,0 +1,70 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time +AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +Operational intensity (PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +AVX DP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 +Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/((SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0) +-- +Profiling group to measure memory bandwidth
drawn by all cores of a socket. +Since this group is based on Uncore events, measurements are only possible on +a per-socket basis. Also outputs the total data volume transferred from main memory and +the SSE scalar and packed double-precision FLOP rates, including the packed AVX +(256-bit and 512-bit) rates. +The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. diff --git a/collectors/likwid/groups/skylakeX/MEM_SP.txt b/collectors/likwid/groups/skylakeX/MEM_SP.txt new file mode 100644 index 0000000..5720938 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/MEM_SP.txt @@ -0,0 +1,70 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE +PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE +PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE +PMC3 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time +AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 +Operational intensity (PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +AVX SP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/runtime +Packed [MUOPS/s] = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime +Memory
write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 +Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/((SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0) +-- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events, measurements are only possible on +a per-socket basis. Also outputs the total data volume transferred from main memory and +the SSE scalar and packed single-precision FLOP rates, including the packed AVX +(256-bit and 512-bit) rates. +The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column. diff --git a/collectors/likwid/groups/skylakeX/TLB_DATA.txt b/collectors/likwid/groups/skylakeX/TLB_DATA.txt new file mode 100644 index 0000000..10ee5e1 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/TLB_DATA.txt @@ -0,0 +1,35 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_LOAD_MISSES_CAUSES_A_WALK +PMC1 DTLB_STORE_MISSES_CAUSES_A_WALK +PMC2 DTLB_LOAD_MISSES_WALK_ACTIVE +PMC3 DTLB_STORE_MISSES_WALK_ACTIVE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB load misses PMC0 +L1 DTLB load miss rate PMC0/FIXC0 +L1 DTLB load miss duration [Cyc] PMC2/PMC0 +L1 DTLB store misses PMC1 +L1 DTLB store miss rate PMC1/FIXC0 +L1 DTLB store miss duration [Cyc] PMC3/PMC1 + +LONG +Formulas: +L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_ACTIVE / DTLB_LOAD_MISSES_CAUSES_A_WALK +L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK +L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_ACTIVE / DTLB_STORE_MISSES_CAUSES_A_WALK +- +The DTLB load and store miss rates give a measure of how often a TLB miss occurred +per instruction. The duration measures how many cycles a page table walk took on average. + diff --git a/collectors/likwid/groups/skylakeX/TLB_INSTR.txt b/collectors/likwid/groups/skylakeX/TLB_INSTR.txt new file mode 100644 index 0000000..9bc65a7 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/TLB_INSTR.txt @@ -0,0 +1,28 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ITLB_MISSES_CAUSES_A_WALK +PMC1 ITLB_MISSES_WALK_ACTIVE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + + +LONG +Formulas: +L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK +L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_ACTIVE / ITLB_MISSES_CAUSES_A_WALK +- +The ITLB miss rate gives a measure of how often a TLB miss occurred +per instruction.
The duration measures how many cycles a page table walk took on average. + diff --git a/collectors/likwid/groups/skylakeX/TMA.txt b/collectors/likwid/groups/skylakeX/TMA.txt new file mode 100644 index 0000000..afb4126 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/TMA.txt @@ -0,0 +1,48 @@ +SHORT Top down cycle allocation + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_RETIRED_RETIRE_SLOTS +PMC2 IDQ_UOPS_NOT_DELIVERED_CORE +PMC3 INT_MISC_RECOVERY_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +IPC FIXC0/FIXC1 +Total Slots 4*FIXC1 +Slots Retired PMC1 +Fetch Bubbles PMC2 +Recovery Bubbles 4*PMC3 +Front End [%] PMC2/(4*FIXC1)*100 +Speculation [%] (PMC0-PMC1+(4*PMC3))/(4*FIXC1)*100 +Retiring [%] PMC1/(4*FIXC1)*100 +Back End [%] (1-((PMC2+PMC0+(4*PMC3))/(4*FIXC1)))*100 + +LONG +Formulas: +Total Slots = 4*CPU_CLK_UNHALTED_CORE +Slots Retired = UOPS_RETIRED_RETIRE_SLOTS +Fetch Bubbles = IDQ_UOPS_NOT_DELIVERED_CORE +Recovery Bubbles = 4*INT_MISC_RECOVERY_CYCLES +Front End [%] = IDQ_UOPS_NOT_DELIVERED_CORE/(4*CPU_CLK_UNHALTED_CORE)*100 +Speculation [%] = (UOPS_ISSUED_ANY-UOPS_RETIRED_RETIRE_SLOTS+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)*100 +Retiring [%] = UOPS_RETIRED_RETIRE_SLOTS/(4*CPU_CLK_UNHALTED_CORE)*100 +Back End [%] = (1-((IDQ_UOPS_NOT_DELIVERED_CORE+UOPS_ISSUED_ANY+(4*INT_MISC_RECOVERY_CYCLES))/(4*CPU_CLK_UNHALTED_CORE)))*100 +-- +This performance group measures cycles to determine the percentage of time spent in the +front end, back end, retiring and speculation. These metrics are published and +verified by Intel. Further information: +Webpage describing the Top-Down Method and its usage in Intel vTune: +https://software.intel.com/en-us/vtune-amplifier-help-tuning-applications-using-a-top-down-microarchitecture-analysis-method +Paper by Ahmad Yasin: +https://sites.google.com/site/analysismethods/yasin-pubs/TopDown-Yasin-ISPASS14.pdf?attredirects=0 +Slides by Ahmad Yasin: +http://www.cs.technion.ac.il/~erangi/TMA_using_Linux_perf__Ahmad_Yasin.pdf +The performance group was originally published here: +http://perf.mvermeulen.com/2018/04/14/top-down-performance-counter-analysis-part-1-likwid/ diff --git a/collectors/likwid/groups/skylakeX/UOPS_EXEC.txt b/collectors/likwid/groups/skylakeX/UOPS_EXEC.txt new file mode 100644 index 0000000..7042df7 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/UOPS_EXEC.txt @@ -0,0 +1,31 @@ +SHORT UOPs execution + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_EXECUTED_USED_CYCLES +PMC1 UOPS_EXECUTED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_EXECUTED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_EXECUTED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES +Unused cycles ratio [%] = 100*UOPS_EXECUTED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES +Avg stall duration [cycles] = UOPS_EXECUTED_STALL_CYCLES/UOPS_EXECUTED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the execution stage in the pipeline.
Used cycles are all cycles where uOPs are +executed while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles. diff --git a/collectors/likwid/groups/skylakeX/UOPS_ISSUE.txt b/collectors/likwid/groups/skylakeX/UOPS_ISSUE.txt new file mode 100644 index 0000000..9aac923 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/UOPS_ISSUE.txt @@ -0,0 +1,31 @@ +SHORT UOPs issuing + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_USED_CYCLES +PMC1 UOPS_ISSUED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_ISSUED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_ISSUED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES +Unused cycles ratio [%] = 100*UOPS_ISSUED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES +Avg stall duration [cycles] = UOPS_ISSUED_STALL_CYCLES/UOPS_ISSUED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the issue stage in the pipeline. Used cycles are all cycles where uOPs are +issued while unused cycles refer to pipeline stalls. Moreover, the group +calculates the average stall duration in cycles. diff --git a/collectors/likwid/groups/skylakeX/UOPS_RETIRE.txt b/collectors/likwid/groups/skylakeX/UOPS_RETIRE.txt new file mode 100644 index 0000000..0f37585 --- /dev/null +++ b/collectors/likwid/groups/skylakeX/UOPS_RETIRE.txt @@ -0,0 +1,31 @@ +SHORT UOPs retirement + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_RETIRED_USED_CYCLES +PMC1 UOPS_RETIRED_STALL_CYCLES +PMC2 CPU_CLOCK_UNHALTED_TOTAL_CYCLES +PMC3:EDGEDETECT UOPS_RETIRED_STALL_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Used cycles ratio [%] 100*PMC0/PMC2 +Unused cycles ratio [%] 100*PMC1/PMC2 +Avg stall duration [cycles] PMC1/PMC3:EDGEDETECT + + +LONG +Formulas: +Used cycles ratio [%] = 100*UOPS_RETIRED_USED_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES +Unused cycles ratio [%] = 100*UOPS_RETIRED_STALL_CYCLES/CPU_CLOCK_UNHALTED_TOTAL_CYCLES +Avg stall duration [cycles] = UOPS_RETIRED_STALL_CYCLES/UOPS_RETIRED_STALL_CYCLES:EDGEDETECT +- +This performance group returns the ratios of used and unused cycles regarding +the retirement stage in the pipeline (re-order buffer). Used cycles are all +cycles where uOPs are retired while unused cycles refer to pipeline stalls. +Moreover, the group calculates the average stall duration in cycles.
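The EDGEDETECT qualifier used by the three UOPS_* groups turns a per-cycle level event (stalled or not) into a count of rising edges, i.e. the number of distinct stall phases; dividing the total stall cycles by that edge count yields the average stall duration. A Go sketch with hypothetical counts (an illustration of the arithmetic, not collector code):

```go
package main

import "fmt"

func main() {
	// Hypothetical counts: the EDGEDETECT variant counts how many separate
	// stall phases occurred, while the plain event counts stalled cycles.
	stallCycles := 4.2e8 // UOPS_RETIRED_STALL_CYCLES            (PMC1)
	stallPhases := 6.0e6 // UOPS_RETIRED_STALL_CYCLES:EDGEDETECT (PMC3)
	totalCycles := 2.0e9 // CPU_CLOCK_UNHALTED_TOTAL_CYCLES      (PMC2)

	fmt.Printf("unused cycles ratio: %.1f %%\n", 100*stallCycles/totalCycles)
	fmt.Printf("avg stall duration:  %.1f cycles\n", stallCycles/stallPhases)
}
```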
diff --git a/collectors/likwid/groups/skylakeX/UPI.txt b/collectors/likwid/groups/skylakeX/UPI.txt new file mode 100644 index 0000000..2a4c44f --- /dev/null +++ b/collectors/likwid/groups/skylakeX/UPI.txt @@ -0,0 +1,42 @@ +SHORT UPI data traffic + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +SBOX0C0 TXL_FLITS_ALL_DATA +SBOX0C1 RXL_FLITS_ALL_DATA +SBOX1C0 TXL_FLITS_ALL_DATA +SBOX1C1 RXL_FLITS_ALL_DATA +SBOX2C0 TXL_FLITS_ALL_DATA +SBOX2C1 RXL_FLITS_ALL_DATA + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Received data bandwidth [MByte/s] 1.0E-06*((SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0/time +Received data volume [GByte] 1.0E-09*((SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0 +Sent data bandwidth [MByte/s] 1.0E-06*((SBOX0C0+SBOX1C0+SBOX2C0)/9.0)*64.0/time +Sent data volume [GByte] 1.0E-09*((SBOX0C0+SBOX1C0+SBOX2C0)/9.0)*64.0 +Total data bandwidth [MByte/s] 1.0E-06*((SBOX0C0+SBOX1C0+SBOX2C0+SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0/time +Total data volume [GByte] 1.0E-09*((SBOX0C0+SBOX1C0+SBOX2C0+SBOX0C1+SBOX1C1+SBOX2C1)/9.0)*64.0 + + +LONG +Formulas: +Received data bandwidth [MByte/s] = 1.0E-06*(SUM(RXL_FLITS_ALL_DATA)/9.0)*64.0/runtime +Received data volume [GByte] = 1.0E-09*(SUM(RXL_FLITS_ALL_DATA)/9.0)*64.0 +Sent data bandwidth [MByte/s] = 1.0E-06*(SUM(TXL_FLITS_ALL_DATA)/9.0)*64.0/time +Sent data volume [GByte] = 1.0E-09*(SUM(TXL_FLITS_ALL_DATA)/9.0)*64.0 +Total data bandwidth [MByte/s] = 1.0E-06*((SUM(RXL_FLITS_ALL_DATA)+SUM(TXL_FLITS_ALL_DATA))/9.0)*64.0/time +Total data volume [GByte] = 1.0E-09*((SUM(RXL_FLITS_ALL_DATA)+SUM(TXL_FLITS_ALL_DATA))/9.0)*64.0 +-- +This group measures the data traffic on the UPI (socket interconnect). The group +measures all filled data slots (9 slots per 64 Byte data transfer), which is why +the count is divided by 9. These 9 data chunks are not transferred in +a single flit but there is one flit for the header and three flits for the data. +The metrics may show higher values than expected because the events also count +other transfers which include data. diff --git a/collectors/likwid/groups/westmere/BRANCH.txt b/collectors/likwid/groups/westmere/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/westmere/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often, on average, a branch or a mispredicted branch occurred +per retired instruction. The branch misprediction ratio states what fraction +of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate.
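A Go sketch of the BRANCH derivations with hypothetical counts (note how the rates are per retired instruction while the ratio is per branch; illustration only):

```go
package main

import "fmt"

func main() {
	// Hypothetical retired-event counts for one measurement phase.
	instr := 5.0e9    // INSTR_RETIRED_ANY
	branches := 9.0e8 // BR_INST_RETIRED_ALL_BRANCHES
	mispred := 1.8e7  // BR_MISP_RETIRED_ALL_BRANCHES

	fmt.Printf("branch rate:             %.4f\n", branches/instr)
	fmt.Printf("misprediction rate:      %.6f\n", mispred/instr)
	fmt.Printf("misprediction ratio:     %.4f\n", mispred/branches)
	fmt.Printf("instructions per branch: %.2f\n", instr/branches)
}
```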
+ diff --git a/collectors/likwid/groups/westmere/CACHE.txt b/collectors/likwid/groups/westmere/CACHE.txt new file mode 100644 index 0000000..6a5e4fe --- /dev/null +++ b/collectors/likwid/groups/westmere/CACHE.txt @@ -0,0 +1,26 @@ +SHORT Data cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +data cache misses PMC0 +data cache miss rate PMC0/FIXC0 + +LONG +Formulas: +data cache misses = L1D_REPL +data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY +- +This group measures the locality of your data accesses with regard to the +L1 cache. +The data cache miss rate gives a measure how often it was necessary to get +cache lines from higher levels of the memory hierarchy. + diff --git a/collectors/likwid/groups/westmere/CLOCK.txt b/collectors/likwid/groups/westmere/CLOCK.txt new file mode 100644 index 0000000..5f862a5 --- /dev/null +++ b/collectors/likwid/groups/westmere/CLOCK.txt @@ -0,0 +1,21 @@ +SHORT CPU clock information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 + +LONG +Formulas: +Runtime (RDTSC) [s] = time +Runtime unhalted [s] = CPU_CLK_UNHALTED_CORE*inverseClock +Clock [MHz] = 1.E-06*(CPU_CLK_UNHALTED_CORE/CPU_CLK_UNHALTED_REF)/inverseClock +CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY +- +CPU clock information diff --git a/collectors/likwid/groups/westmere/DATA.txt b/collectors/likwid/groups/westmere/DATA.txt new file mode 100644 index 0000000..31bba51 --- /dev/null +++ b/collectors/likwid/groups/westmere/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_INST_RETIRED_LOADS +PMC1 MEM_INST_RETIRED_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_INST_RETIRED_LOADS/MEM_INST_RETIRED_STORES +- +This is a simple metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/westmere/DIVIDE.txt b/collectors/likwid/groups/westmere/DIVIDE.txt new file mode 100644 index 0000000..2677a19 --- /dev/null +++ b/collectors/likwid/groups/westmere/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ARITH_NUM_DIV +PMC1 ARITH_CYCLES_DIV_BUSY + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_NUM_DIV +Avg. 
divide unit usage duration = ARITH_CYCLES_DIV_BUSY/ARITH_NUM_DIV +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/westmere/FLOPS_DP.txt b/collectors/likwid/groups/westmere/FLOPS_DP.txt new file mode 100644 index 0000000..c5c8203 --- /dev/null +++ b/collectors/likwid/groups/westmere/FLOPS_DP.txt @@ -0,0 +1,35 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR +PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION +PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +SP [MUOPS/s] 1.0E-06*PMC2/time +DP [MUOPS/s] 1.0E-06*PMC3/time + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime +Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime +SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime +DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime +- +Westmere provides no way to measure MFLOP/s exactly if mixed-precision calculations are done. +Therefore both single and double precision are measured to ensure the correctness +of the measurements. You can check whether your code was vectorized by comparing the number of +FP_COMP_OPS_EXE_SSE_FP_PACKED uops with FP_COMP_OPS_EXE_SSE_FP_SCALAR. + diff --git a/collectors/likwid/groups/westmere/FLOPS_SP.txt b/collectors/likwid/groups/westmere/FLOPS_SP.txt new file mode 100644 index 0000000..933b058 --- /dev/null +++ b/collectors/likwid/groups/westmere/FLOPS_SP.txt @@ -0,0 +1,35 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR +PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION +PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +SP [MUOPS/s] 1.0E-06*PMC2/time +DP [MUOPS/s] 1.0E-06*PMC3/time + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime +Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime +SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime +DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime +- +Westmere provides no way to measure MFLOP/s exactly if mixed-precision calculations are done. +Therefore both single and double precision are measured to ensure the correctness +of the measurements. You can check whether your code was vectorized by comparing the number of +FP_COMP_OPS_EXE_SSE_FP_PACKED uops with FP_COMP_OPS_EXE_SSE_FP_SCALAR.
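Since the packed event counts 128-bit SSE uops, the FLOP multiplier is the only difference between the DP and SP derivations (2 vs. 4 operations per packed uop). A Go sketch with hypothetical counts (illustration only; whether to apply the DP or SP multiplier depends on which precision your code actually uses):

```go
package main

import "fmt"

func main() {
	// Hypothetical counts; a 128-bit packed SSE uop holds 2 double-precision
	// or 4 single-precision operations.
	packed := 1.5e9 // FP_COMP_OPS_EXE_SSE_FP_PACKED
	scalar := 2.0e8 // FP_COMP_OPS_EXE_SSE_FP_SCALAR
	seconds := 3.0

	dpMflops := 1.0e-06 * (packed*2.0 + scalar) / seconds // if the code is DP
	spMflops := 1.0e-06 * (packed*4.0 + scalar) / seconds // if the code is SP
	fmt.Printf("DP: %.1f MFLOP/s, SP: %.1f MFLOP/s\n", dpMflops, spMflops)
}
```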
+ diff --git a/collectors/likwid/groups/westmere/FLOPS_X87.txt b/collectors/likwid/groups/westmere/FLOPS_X87.txt new file mode 100644 index 0000000..39cd8b4 --- /dev/null +++ b/collectors/likwid/groups/westmere/FLOPS_X87.txt @@ -0,0 +1,21 @@ +SHORT X87 MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INST_RETIRED_X87 + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +X87 [MFLOP/s] 1.0E-06*PMC0/time + +LONG +Formulas: +X87 [MFLOP/s] = 1.0E-06*INST_RETIRED_X87/runtime +- +Profiling group to measure the X87 FLOP rate. + diff --git a/collectors/likwid/groups/westmere/ICACHE.txt b/collectors/likwid/groups/westmere/ICACHE.txt new file mode 100644 index 0000000..49943ff --- /dev/null +++ b/collectors/likwid/groups/westmere/ICACHE.txt @@ -0,0 +1,25 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1I_READS +PMC1 L1I_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 + +LONG +Formulas: +L1I request rate = L1I_READS / INSTR_RETIRED_ANY +L1I miss rate = L1I_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = L1I_MISSES / L1I_READS +- +This group measures some L1 instruction cache metrics. diff --git a/collectors/likwid/groups/westmere/L2.txt b/collectors/likwid/groups/westmere/L2.txt new file mode 100644 index 0000000..74f7d58 --- /dev/null +++ b/collectors/likwid/groups/westmere/L2.txt @@ -0,0 +1,38 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPL +PMC1 L1D_M_EVICT +PMC2 L1I_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the +number of cache lines allocated in the L1 and the number of modified cache lines +evicted from the L1. The group also reports the data volume transferred between +L2 and L1 cache. Note that this bandwidth also includes data transfers due to a +write allocate load on a store miss in L1 and traffic caused by misses in the +L1 instruction cache.
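All of the cache bandwidth groups follow the same pattern: count cache line transfers, multiply by 64 bytes, divide by the measurement time. A Go sketch of the L2 derivation above with hypothetical counts (illustration only):

```go
package main

import "fmt"

func main() {
	// Hypothetical counts; each event represents one 64-byte cache line
	// moving between L1 and L2.
	l1dRepl := 2.5e8  // L1D_REPL    (lines loaded into L1)
	l1dEvict := 9.0e7 // L1D_M_EVICT (modified lines written back)
	l1iMiss := 1.0e6  // L1I_MISSES  (instruction cache refills)
	seconds := 2.0

	lines := l1dRepl + l1dEvict + l1iMiss
	fmt.Printf("L2 bandwidth:   %.1f MB/s\n", 1.0e-06*lines*64.0/seconds)
	fmt.Printf("L2 data volume: %.2f GB\n", 1.0e-09*lines*64.0)
}
```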
+ + diff --git a/collectors/likwid/groups/westmere/L2CACHE.txt b/collectors/likwid/groups/westmere/L2CACHE.txt new file mode 100644 index 0000000..343b263 --- /dev/null +++ b/collectors/likwid/groups/westmere/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_RQSTS_REFERENCES +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_RQSTS_REFERENCES/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_RQSTS_REFERENCES +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure of how often it was necessary to get +cache lines from memory. And finally the L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm, you should +try to get the data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/westmere/L3.txt b/collectors/likwid/groups/westmere/L3.txt new file mode 100644 index 0000000..a1d95e3 --- /dev/null +++ b/collectors/likwid/groups/westmere/L3.txt @@ -0,0 +1,37 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_RQSTS_MISS +PMC1 L2_LINES_OUT_DIRTY_ANY + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*(PMC1)*64.0/time +L3 evict data volume [GBytes] 1.0E-09*(PMC1)*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_RQSTS_MISS*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_RQSTS_MISS*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DIRTY_ANY*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DIRTY_ANY*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_RQSTS_MISS+L2_LINES_OUT_DIRTY_ANY)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_RQSTS_MISS+L2_LINES_OUT_DIRTY_ANY)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the +number of cache lines allocated in the L2 and the number of modified cache lines +evicted from the L2. The group also reports the total data volume transferred between L3 and +the measured L2 cache. Note that this bandwidth also includes data transfers +due to a write allocate load on a store miss in L2.
+ diff --git a/collectors/likwid/groups/westmere/L3CACHE.txt b/collectors/likwid/groups/westmere/L3CACHE.txt new file mode 100644 index 0000000..58072c1 --- /dev/null +++ b/collectors/likwid/groups/westmere/L3CACHE.txt @@ -0,0 +1,34 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +UPMC0 UNC_L3_HITS_ANY +UPMC1 UNC_L3_MISS_ANY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate (UPMC0+UPMC1)/FIXC0 +L3 miss rate UPMC1/FIXC0 +L3 miss ratio UPMC1/(UPMC0+UPMC1) + +LONG +Formulas: +L3 request rate = (UNC_L3_HITS_ANY+UNC_L3_MISS_ANY)/INSTR_RETIRED_ANY +L3 miss rate = UNC_L3_MISS_ANY/INSTR_RETIRED_ANY +L3 miss ratio = UNC_L3_MISS_ANY/(UNC_L3_HITS_ANY+UNC_L3_MISS_ANY) +- +This group measures the locality of your data accesses with regard to the L3 +Cache. L3 request rate tells you how data intensive your code is or how many +data accesses you have on average per instruction. The L3 miss rate gives a +measure how often it was necessary to get cache lines from memory. And finally +L3 miss ratio tells you how many of your memory references required a cache line +to be loaded from a higher level. While the data cache miss rate might be given +by your algorithm you should try to get data cache miss ratio as low as +possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/westmere/MEM.txt b/collectors/likwid/groups/westmere/MEM.txt new file mode 100644 index 0000000..b5165e1 --- /dev/null +++ b/collectors/likwid/groups/westmere/MEM.txt @@ -0,0 +1,50 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +UPMC0 UNC_QMC_NORMAL_READS_ANY +UPMC1 UNC_QMC_WRITES_FULL_ANY +UPMC2 UNC_QHL_REQUESTS_REMOTE_READS +UPMC3 UNC_QHL_REQUESTS_REMOTE_WRITES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*UPMC0*64.0/time +Memory read data volume [GBytes] 1.0E-09*UPMC0*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*UPMC1*64.0/time +Memory write data volume [GBytes] 1.0E-09*UPMC1*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64.0 +Remote memory read bandwidth [MBytes/s] 1.0E-06*UPMC2*64.0/time +Remote memory read data volume [GBytes] 1.0E-09*UPMC2*64.0 +Remote memory write bandwidth [MBytes/s] 1.0E-06*UPMC3*64.0/time +Remote memory write data volume [GBytes] 1.0E-09*UPMC3*64.0 +Remote memory bandwidth [MBytes/s] 1.0E-06*(UPMC2+UPMC3)*64.0/time +Remote memory data volume [GBytes] 1.0E-09*(UPMC2+UPMC3)*64.0 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_NORMAL_READS_ANY*64.0/time +Memory read data volume [GBytes] = 1.0E-09*UNC_QMC_NORMAL_READS_ANY*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_WRITES_FULL_ANY*64.0/time +Memory write data volume [GBytes] = 1.0E-09*UNC_QMC_WRITES_FULL_ANY*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0/time +Memory data volume [GBytes] = 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0 +Remote memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_READS*64.0/time +Remote memory read data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_READS*64.0 +Remote memory write bandwidth [MBytes/s] = 
1.0E-06*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0/time +Remote memory write data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0 +Remote memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0/time +Remote memory data volume [GBytes] = 1.0E-09*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +This group will be measured by one core per socket. The remote read BW tells +you if cache lines are transferred between sockets, meaning that cores access +data owned by a remote NUMA domain. The group also reports total data volume +transferred from main memory. + diff --git a/collectors/likwid/groups/westmere/MEM_DP.txt b/collectors/likwid/groups/westmere/MEM_DP.txt new file mode 100644 index 0000000..64161dd --- /dev/null +++ b/collectors/likwid/groups/westmere/MEM_DP.txt @@ -0,0 +1,66 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR +PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION +PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION +UPMC0 UNC_QMC_NORMAL_READS_ANY +UPMC1 UNC_QMC_WRITES_FULL_ANY +UPMC2 UNC_QHL_REQUESTS_REMOTE_READS +UPMC3 UNC_QHL_REQUESTS_REMOTE_WRITES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +SP [MUOPS/s] 1.0E-06*PMC2/time +DP [MUOPS/s] 1.0E-06*PMC3/time +Memory read bandwidth [MBytes/s] 1.0E-06*UPMC0*64.0/time +Memory read data volume [GBytes] 1.0E-09*UPMC0*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*UPMC1*64.0/time +Memory write data volume [GBytes] 1.0E-09*UPMC1*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64.0 +Remote memory read bandwidth [MBytes/s] 1.0E-06*UPMC2*64.0/time +Remote memory read data volume [GBytes] 1.0E-09*UPMC2*64.0 +Remote memory write bandwidth [MBytes/s] 1.0E-06*UPMC3*64.0/time +Remote memory write data volume [GBytes] 1.0E-09*UPMC3*64.0 +Remote memory bandwidth [MBytes/s] 1.0E-06*(UPMC2+UPMC3)*64.0/time +Remote memory data volume [GBytes] 1.0E-09*(UPMC2+UPMC3)*64.0 +Operational intensity (PMC0*2.0+PMC1)/((UPMC0+UPMC1)*64.0) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime +Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime +SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime +DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_NORMAL_READS_ANY*64.0/time +Memory read data volume [GBytes] = 1.0E-09*UNC_QMC_NORMAL_READS_ANY*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_WRITES_FULL_ANY*64.0/time +Memory write data volume [GBytes] = 1.0E-09*UNC_QMC_WRITES_FULL_ANY*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0/time +Memory data volume [GBytes] = 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0 +Remote memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_READS*64.0/time +Remote memory read data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_READS*64.0 
+Remote memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0/time +Remote memory write data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0 +Remote memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0/time +Remote memory data volume [GBytes] = 1.0E-09*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0 +Operational intensity = (FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/((UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0) +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +This group will be measured by one core per socket. The remote read BW tells +you if cache lines are transferred between sockets, meaning that cores access +data owned by a remote NUMA domain. The group also reports total data volume +transferred from main memory. + diff --git a/collectors/likwid/groups/westmere/MEM_SP.txt b/collectors/likwid/groups/westmere/MEM_SP.txt new file mode 100644 index 0000000..812c7fa --- /dev/null +++ b/collectors/likwid/groups/westmere/MEM_SP.txt @@ -0,0 +1,66 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR +PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION +PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION +UPMC0 UNC_QMC_NORMAL_READS_ANY +UPMC1 UNC_QMC_WRITES_FULL_ANY +UPMC2 UNC_QHL_REQUESTS_REMOTE_READS +UPMC3 UNC_QHL_REQUESTS_REMOTE_WRITES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +SP [MUOPS/s] 1.0E-06*PMC2/time +DP [MUOPS/s] 1.0E-06*PMC3/time +Memory read bandwidth [MBytes/s] 1.0E-06*UPMC0*64.0/time +Memory read data volume [GBytes] 1.0E-09*UPMC0*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*UPMC1*64.0/time +Memory write data volume [GBytes] 1.0E-09*UPMC1*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64.0 +Remote memory read bandwidth [MBytes/s] 1.0E-06*UPMC2*64.0/time +Remote memory read data volume [GBytes] 1.0E-09*UPMC2*64.0 +Remote memory write bandwidth [MBytes/s] 1.0E-06*UPMC3*64.0/time +Remote memory write data volume [GBytes] 1.0E-09*UPMC3*64.0 +Remote memory bandwidth [MBytes/s] 1.0E-06*(UPMC2+UPMC3)*64.0/time +Remote memory data volume [GBytes] 1.0E-09*(UPMC2+UPMC3)*64.0 +Operational intensity (PMC0*4.0+PMC1)/((UPMC0+UPMC1)*64.0) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime +Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime +SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime +DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_NORMAL_READS_ANY*64.0/time +Memory read data volume [GBytes] = 1.0E-09*UNC_QMC_NORMAL_READS_ANY*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_WRITES_FULL_ANY*64.0/time +Memory write data volume [GBytes] = 1.0E-09*UNC_QMC_WRITES_FULL_ANY*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0/time +Memory data volume [GBytes] = 
1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0 +Remote memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_READS*64.0/time +Remote memory read data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_READS*64.0 +Remote memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0/time +Remote memory write data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0 +Remote memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0/time +Remote memory data volume [GBytes] = 1.0E-09*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0 +Operational intensity = (FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/((UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0) +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +This group will be measured by one core per socket. The remote read BW tells +you if cache lines are transferred between sockets, meaning that cores access +data owned by a remote NUMA domain. The group also reports total data volume +transferred from main memory. + diff --git a/collectors/likwid/groups/westmere/TLB_DATA.txt b/collectors/likwid/groups/westmere/TLB_DATA.txt new file mode 100644 index 0000000..d256b8c --- /dev/null +++ b/collectors/likwid/groups/westmere/TLB_DATA.txt @@ -0,0 +1,35 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_LOAD_MISSES_ANY +PMC1 DTLB_MISSES_ANY +PMC2 DTLB_LOAD_MISSES_WALK_CYCLES +PMC3 DTLB_MISSES_WALK_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB load misses PMC0 +L1 DTLB load miss rate PMC0/FIXC0 +L1 DTLB load miss duration [Cyc] PMC2/PMC0 +L1 DTLB store misses (PMC1-PMC0) +L1 DTLB store miss rate (PMC1-PMC0)/FIXC0 +L1 DTLB store miss duration [Cyc] (PMC3-PMC2)/(PMC1-PMC0) + +LONG +Formulas: +L1 DTLB load misses = DTLB_LOAD_MISSES_ANY +L1 DTLB load miss rate = DTLB_LOAD_MISSES_ANY / INSTR_RETIRED_ANY +L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_CYCLES / DTLB_LOAD_MISSES_ANY +L1 DTLB store misses = DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY +L1 DTLB store miss rate = (DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY) / INSTR_RETIRED_ANY +L1 DTLB store miss duration [Cyc] = (DTLB_MISSES_WALK_CYCLES-DTLB_LOAD_MISSES_WALK_CYCLES) / (DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY) +- +The DTLB miss rate gives a measure how often a TLB miss occurred +per instruction. The store miss calculations are done using ALL-LOADS TLB walks. 
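Because Westmere has no dedicated store-miss TLB event, the TLB_DATA group derives store misses by subtraction, which any consumer of the metrics has to reproduce. A Go sketch with hypothetical counts (illustration only):

```go
package main

import "fmt"

func main() {
	// Hypothetical counts; store misses are derived as ALL minus LOADS,
	// as in the formulas above.
	loadMisses := 4.0e6 // DTLB_LOAD_MISSES_ANY
	allMisses := 5.5e6  // DTLB_MISSES_ANY
	loadWalk := 8.0e7   // DTLB_LOAD_MISSES_WALK_CYCLES
	allWalk := 1.2e8    // DTLB_MISSES_WALK_CYCLES

	storeMisses := allMisses - loadMisses
	fmt.Printf("store misses:        %.0f\n", storeMisses)
	fmt.Printf("load walk duration:  %.1f cycles\n", loadWalk/loadMisses)
	fmt.Printf("store walk duration: %.1f cycles\n", (allWalk-loadWalk)/storeMisses)
}
```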
+ diff --git a/collectors/likwid/groups/westmere/TLB_INSTR.txt b/collectors/likwid/groups/westmere/TLB_INSTR.txt new file mode 100644 index 0000000..2f0f90c --- /dev/null +++ b/collectors/likwid/groups/westmere/TLB_INSTR.txt @@ -0,0 +1,27 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ITLB_MISSES_ANY +PMC1 ITLB_MISSES_WALK_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + +LONG +Formulas: +L1 ITLB misses = ITLB_MISSES_ANY +L1 ITLB miss rate = ITLB_MISSES_ANY / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_CYCLES / ITLB_MISSES_ANY +- +The ITLB miss rate gives a measure of how often a TLB miss occurred +per instruction. The duration measures how many cycles a page table walk took on average. + diff --git a/collectors/likwid/groups/westmere/UOPS.txt b/collectors/likwid/groups/westmere/UOPS.txt new file mode 100644 index 0000000..b2446aa --- /dev/null +++ b/collectors/likwid/groups/westmere/UOPS.txt @@ -0,0 +1,35 @@ +SHORT UOPs execution info + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC1 UOPS_EXECUTED_THREAD +PMC2 UOPS_RETIRED_ANY +PMC3 UOPS_ISSUED_FUSED + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Issued UOPs PMC0 +Merged UOPs PMC3 +Executed UOPs PMC1 +Retired UOPs PMC2 + +LONG +Formulas: +Issued UOPs = UOPS_ISSUED_ANY +Merged UOPs = UOPS_ISSUED_FUSED +Executed UOPs = UOPS_EXECUTED_THREAD +Retired UOPs = UOPS_RETIRED_ANY +- +This group returns information about the instruction pipeline. It measures the +issued, executed and retired uOPs, from which you can derive the number of uOPs that were issued +but not executed as well as the number of uOPs that were executed but never retired. +The executed but not retired uOPs commonly come from speculatively executed branches.
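A Go sketch of the pipeline deltas described above, with hypothetical counts (the differences are derived by the consumer, not reported directly by the group):

```go
package main

import "fmt"

func main() {
	// Hypothetical counts from the uop pipeline stages.
	issued := 6.0e9   // UOPS_ISSUED_ANY
	executed := 5.8e9 // UOPS_EXECUTED_THREAD
	retired := 5.5e9  // UOPS_RETIRED_ANY

	// Issued but never executed (e.g. cancelled), and executed but never
	// retired (mostly speculation down mispredicted branches).
	fmt.Printf("issued but not executed:  %.2e uops\n", issued-executed)
	fmt.Printf("executed but not retired: %.2e uops\n", executed-retired)
}
```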
+ diff --git a/collectors/likwid/groups/westmere/VIEW.txt b/collectors/likwid/groups/westmere/VIEW.txt new file mode 100644 index 0000000..38d907c --- /dev/null +++ b/collectors/likwid/groups/westmere/VIEW.txt @@ -0,0 +1,50 @@ +SHORT Overview of arithmetic and memory performance + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR +PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION +PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION +UPMC0 UNC_QMC_NORMAL_READS_ANY +UPMC1 UNC_QMC_WRITES_FULL_ANY +UPMC2 UNC_QHL_REQUESTS_REMOTE_READS +UPMC3 UNC_QHL_REQUESTS_LOCAL_READS +UPMC4 UNC_QHL_REQUESTS_REMOTE_WRITES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] (DP assumed) 1.0E-06*(PMC0*2.0+PMC1)/time +SP [MFLOP/s] (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +SP [MUOPS/s] 1.0E-06*PMC2/time +DP [MUOPS/s] 1.0E-06*PMC3/time +Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64/time +Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64 +Remote Read BW [MBytes/s] 1.0E-06*(UPMC2)*64/time +Remote Write BW [MBytes/s] 1.0E-06*(UPMC4)*64/time +Remote BW [MBytes/s] 1.0E-06*(UPMC2+UPMC4)*64/time + +LONG +Formulas: +DP [MFLOP/s] = (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 + FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime +SP [MFLOP/s] = (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 + FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime +Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/time +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/time +SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/time +DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/time +Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64/time +Memory data volume [GBytes] = 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64 +Remote Read BW [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS)*64/time +Remote Write BW [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time +Remote BW [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time +- +This is an overview group that uses the capabilities of Westmere to measure multiple events at +the same time. + diff --git a/collectors/likwid/groups/westmereEX/BRANCH.txt b/collectors/likwid/groups/westmereEX/BRANCH.txt new file mode 100644 index 0000000..b8d41b2 --- /dev/null +++ b/collectors/likwid/groups/westmereEX/BRANCH.txt @@ -0,0 +1,31 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 BR_INST_RETIRED_ALL_BRANCHES +PMC1 BR_MISP_RETIRED_ALL_BRANCHES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Branch rate PMC0/FIXC0 +Branch misprediction rate PMC1/FIXC0 +Branch misprediction ratio PMC1/PMC0 +Instructions per branch FIXC0/PMC0 + +LONG +Formulas: +Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY +Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES +Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES +- +The rates state how often, on average, a branch or a mispredicted branch occurred +per retired instruction.
The branch misprediction ratio states what fraction +of all branch instructions were mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/westmereEX/CACHE.txt b/collectors/likwid/groups/westmereEX/CACHE.txt new file mode 100644 index 0000000..eb160f6 --- /dev/null +++ b/collectors/likwid/groups/westmereEX/CACHE.txt @@ -0,0 +1,25 @@ +SHORT Data cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +data cache misses PMC0 +data cache miss rate PMC0/FIXC0 + +LONG +Formulas: +data cache misses = L1D_REPL +data cache miss rate = L1D_REPL / INSTR_RETIRED_ANY +- +This group measures the locality of your data accesses with regard to the L1 +cache. The data cache miss rate gives a measure of how often it was necessary to +get cache lines from higher levels of the memory hierarchy. + diff --git a/collectors/likwid/groups/westmereEX/DATA.txt b/collectors/likwid/groups/westmereEX/DATA.txt new file mode 100644 index 0000000..31bba51 --- /dev/null +++ b/collectors/likwid/groups/westmereEX/DATA.txt @@ -0,0 +1,22 @@ +SHORT Load to store ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 MEM_INST_RETIRED_LOADS +PMC1 MEM_INST_RETIRED_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Load to store ratio PMC0/PMC1 + +LONG +Formulas: +Load to store ratio = MEM_INST_RETIRED_LOADS/MEM_INST_RETIRED_STORES +- +This is a simple metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/westmereEX/DIVIDE.txt b/collectors/likwid/groups/westmereEX/DIVIDE.txt new file mode 100644 index 0000000..2677a19 --- /dev/null +++ b/collectors/likwid/groups/westmereEX/DIVIDE.txt @@ -0,0 +1,24 @@ +SHORT Divide unit information + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ARITH_NUM_DIV +PMC1 ARITH_CYCLES_DIV_BUSY + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Number of divide ops PMC0 +Avg. divide unit usage duration PMC1/PMC0 + +LONG +Formulas: +Number of divide ops = ARITH_NUM_DIV +Avg.
divide unit usage duration = ARITH_CYCLES_DIV_BUSY/ARITH_NUM_DIV +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/westmereEX/FLOPS_DP.txt b/collectors/likwid/groups/westmereEX/FLOPS_DP.txt new file mode 100644 index 0000000..0c2e56c --- /dev/null +++ b/collectors/likwid/groups/westmereEX/FLOPS_DP.txt @@ -0,0 +1,35 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR +PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION +PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +SP [MUOPS/s] 1.0E-06*PMC2/time +DP [MUOPS/s] 1.0E-06*PMC3/time + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime +Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime +SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime +DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime +- +The Westmere EX has no possibility to measure MFLOP/s if mixed-precision calculations are done. +Therefore, both single and double precision are measured to ensure the correctness +of the measurements. You can check whether your code was vectorized by comparing the number of +FP_COMP_OPS_EXE_SSE_FP_PACKED versus FP_COMP_OPS_EXE_SSE_FP_SCALAR. + diff --git a/collectors/likwid/groups/westmereEX/FLOPS_SP.txt b/collectors/likwid/groups/westmereEX/FLOPS_SP.txt new file mode 100644 index 0000000..d7c8e8e --- /dev/null +++ b/collectors/likwid/groups/westmereEX/FLOPS_SP.txt @@ -0,0 +1,35 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 FP_COMP_OPS_EXE_SSE_FP_PACKED +PMC1 FP_COMP_OPS_EXE_SSE_FP_SCALAR +PMC2 FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION +PMC3 FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1)/time +Packed [MUOPS/s] 1.0E-06*PMC0/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +SP [MUOPS/s] 1.0E-06*PMC2/time +DP [MUOPS/s] 1.0E-06*PMC3/time + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime +Packed [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime +SP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime +DP [MUOPS/s] = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime +- +The Westmere EX has no possibility to measure MFLOP/s if mixed-precision calculations are done. +Therefore, both single and double precision are measured to ensure the correctness +of the measurements. You can check whether your code was vectorized by comparing the number of +FP_COMP_OPS_EXE_SSE_FP_PACKED versus FP_COMP_OPS_EXE_SSE_FP_SCALAR.
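To make the FLOPS group arithmetic above concrete, here is a minimal Go sketch (not part of the patch; the counter values are made up for illustration) that reproduces the DP/SP MFLOP/s formulas, where a packed SSE uop counts as 2 DP or 4 SP operations and a scalar uop counts as one:

package main

import "fmt"

func main() {
	var (
		packed  = 1.0e9 // FP_COMP_OPS_EXE_SSE_FP_PACKED (hypothetical value)
		scalar  = 2.5e8 // FP_COMP_OPS_EXE_SSE_FP_SCALAR (hypothetical value)
		seconds = 2.0   // the "time"/"runtime" term in the group files
	)
	// A packed SSE uop performs 2 double-precision or 4 single-precision
	// operations; a scalar uop performs exactly one.
	dpMflops := 1.0e-6 * (packed*2.0 + scalar) / seconds
	spMflops := 1.0e-6 * (packed*4.0 + scalar) / seconds
	fmt.Printf("DP [MFLOP/s] (DP assumed): %.2f\n", dpMflops)
	fmt.Printf("SP [MFLOP/s] (SP assumed): %.2f\n", spMflops)
}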
+ diff --git a/collectors/likwid/groups/westmereEX/FLOPS_X87.txt b/collectors/likwid/groups/westmereEX/FLOPS_X87.txt new file mode 100644 index 0000000..39cd8b4 --- /dev/null +++ b/collectors/likwid/groups/westmereEX/FLOPS_X87.txt @@ -0,0 +1,21 @@ +SHORT X87 MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 INST_RETIRED_X87 + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +X87 [MFLOP/s] 1.0E-06*PMC0/time + +LONG +Formulas: +X87 [MFLOP/s] = 1.0E-06*INST_RETIRED_X87/runtime +- +Profiling group to measure X87 FLOP rate. + diff --git a/collectors/likwid/groups/westmereEX/ICACHE.txt b/collectors/likwid/groups/westmereEX/ICACHE.txt new file mode 100644 index 0000000..49943ff --- /dev/null +++ b/collectors/likwid/groups/westmereEX/ICACHE.txt @@ -0,0 +1,25 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1I_READS +PMC1 L1I_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1I request rate PMC0/FIXC0 +L1I miss rate PMC1/FIXC0 +L1I miss ratio PMC1/PMC0 + +LONG +Formulas: +L1I request rate = L1I_READS / INSTR_RETIRED_ANY +L1I miss rate = L1I_MISSES / INSTR_RETIRED_ANY +L1I miss ratio = L1I_MISSES / L1I_READS +- +This group measures some L1 instruction cache metrics. diff --git a/collectors/likwid/groups/westmereEX/L2.txt b/collectors/likwid/groups/westmereEX/L2.txt new file mode 100644 index 0000000..e950021 --- /dev/null +++ b/collectors/likwid/groups/westmereEX/L2.txt @@ -0,0 +1,38 @@ +SHORT L2 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L1D_REPL +PMC1 L1D_M_EVICT +PMC2 L1I_MISSES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC0*64.0 +L2D evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L2D evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time +L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0 +L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time +L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64/time +L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64 +- +Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the +number of cache lines allocated in the L1 and the number of modified cache lines +evicted from the L1. Also reports on total data volume transferred between L2 +and L1 cache. Note that this bandwidth also includes data transfers due to a +write allocate load on a store miss in L1 and traffic caused by misses in the +instruction cache.
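The bandwidth formulas in this L2 group (and in the other bandwidth groups below) all follow the same pattern: an event that counts cache lines is multiplied by the 64-byte line size and divided by the measurement time. A short Go sketch, with hypothetical counter values, of that conversion:

package main

import "fmt"

func main() {
	var (
		l1dRepl  = 3.0e8 // L1D_REPL: cache lines loaded into L1 (hypothetical)
		l1dEvict = 1.2e8 // L1D_M_EVICT: modified lines evicted from L1 (hypothetical)
		l1iMiss  = 2.0e7 // L1I_MISSES: instruction cache refills (hypothetical)
		seconds  = 1.5   // measurement duration
	)
	const lineSize = 64.0 // bytes per cache line
	lines := l1dRepl + l1dEvict + l1iMiss
	fmt.Printf("L2 bandwidth [MBytes/s]: %.2f\n", 1.0e-6*lines*lineSize/seconds)
	fmt.Printf("L2 data volume [GBytes]: %.2f\n", 1.0e-9*lines*lineSize)
}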
+ diff --git a/collectors/likwid/groups/westmereEX/L2CACHE.txt b/collectors/likwid/groups/westmereEX/L2CACHE.txt new file mode 100644 index 0000000..343b263 --- /dev/null +++ b/collectors/likwid/groups/westmereEX/L2CACHE.txt @@ -0,0 +1,34 @@ +SHORT L2 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_RQSTS_REFERENCES +PMC1 L2_RQSTS_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L2 request rate PMC0/FIXC0 +L2 miss rate PMC1/FIXC0 +L2 miss ratio PMC1/PMC0 + +LONG +Formulas: +L2 request rate = L2_RQSTS_REFERENCES/INSTR_RETIRED_ANY +L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY +L2 miss ratio = L2_RQSTS_MISS/L2_RQSTS_REFERENCES +- +This group measures the locality of your data accesses with regard to the +L2 cache. L2 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L2 miss rate gives a measure how often it was necessary to get +cache lines from memory. And finally L2 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get data cache miss ratio as low as possible by increasing your cache reuse. + + diff --git a/collectors/likwid/groups/westmereEX/L3.txt b/collectors/likwid/groups/westmereEX/L3.txt new file mode 100644 index 0000000..7e5cb04 --- /dev/null +++ b/collectors/likwid/groups/westmereEX/L3.txt @@ -0,0 +1,36 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 L2_LINES_IN_ANY +PMC1 L2_LINES_OUT_DEMAND_DIRTY + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time +L3 load data volume [GBytes] 1.0E-09*PMC0*64.0 +L3 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time +L3 evict data volume [GBytes] 1.0E-09*PMC1*64.0 +L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time +L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0 + +LONG +Formulas: +L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ANY*64.0/time +L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ANY*64.0 +L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64.0/time +L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DEMAND_DIRTY*64.0 +L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64/time +L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64 +- +Profiling group to measure L3 cache bandwidth. The bandwidth is +computed by the number of cache lines allocated in the L2 and the number of +modified cache lines evicted from the L2. It also reports the data volume transferred +between L3 and L2 caches. Note that this bandwidth also includes data transfers +due to a write allocate load on a store miss in L2.
+ diff --git a/collectors/likwid/groups/westmereEX/L3CACHE.txt b/collectors/likwid/groups/westmereEX/L3CACHE.txt new file mode 100644 index 0000000..262f948 --- /dev/null +++ b/collectors/likwid/groups/westmereEX/L3CACHE.txt @@ -0,0 +1,52 @@ +SHORT L3 cache miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +CBOX0C0 LLC_HITS_ALL +CBOX0C1 LLC_MISSES_ALL +CBOX1C0 LLC_HITS_ALL +CBOX1C1 LLC_MISSES_ALL +CBOX2C0 LLC_HITS_ALL +CBOX2C1 LLC_MISSES_ALL +CBOX3C0 LLC_HITS_ALL +CBOX3C1 LLC_MISSES_ALL +CBOX4C0 LLC_HITS_ALL +CBOX4C1 LLC_MISSES_ALL +CBOX5C0 LLC_HITS_ALL +CBOX5C1 LLC_MISSES_ALL +CBOX6C0 LLC_HITS_ALL +CBOX6C1 LLC_MISSES_ALL +CBOX7C0 LLC_HITS_ALL +CBOX7C1 LLC_MISSES_ALL +CBOX8C0 LLC_HITS_ALL +CBOX8C1 LLC_MISSES_ALL +CBOX9C0 LLC_HITS_ALL +CBOX9C1 LLC_MISSES_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L3 request rate (CBOX0C0+CBOX0C1+CBOX1C0+CBOX1C1+CBOX2C0+CBOX2C1+CBOX3C0+CBOX3C1+CBOX4C0+CBOX4C1+CBOX5C0+CBOX5C1+CBOX6C0+CBOX6C1+CBOX7C0+CBOX7C1+CBOX8C0+CBOX8C1+CBOX9C0+CBOX9C1)/FIXC0 +L3 miss rate (CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1)/FIXC0 +L3 miss ratio (CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1)/(CBOX0C0+CBOX0C1+CBOX1C0+CBOX1C1+CBOX2C0+CBOX2C1+CBOX3C0+CBOX3C1+CBOX4C0+CBOX4C1+CBOX5C0+CBOX5C1+CBOX6C0+CBOX6C1+CBOX7C0+CBOX7C1+CBOX8C0+CBOX8C1+CBOX9C0+CBOX9C1) + +LONG +Formulas: +L3 request rate = (SUM(LLC_HITS_ALL)+SUM(LLC_MISSES_ALL))/INSTR_RETIRED_ANY +L3 miss rate = SUM(LLC_MISSES_ALL)/INSTR_RETIRED_ANY +L3 miss ratio = SUM(LLC_MISSES_ALL)/(SUM(LLC_HITS_ALL)+SUM(LLC_MISSES_ALL)) +- +This group measures the locality of your data accesses with regard to the +L3 cache. L3 request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The L3 miss rate gives a measure how often it was necessary to get +cache lines from memory. And finally L3 miss ratio tells you how many of your +memory references required a cache line to be loaded from a higher level. +While the data cache miss rate might be given by your algorithm you should +try to get data cache miss ratio as low as possible by increasing your cache reuse. 
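The L3CACHE metrics above first sum LLC_HITS_ALL and LLC_MISSES_ALL over all ten C-Box uncore units (CBOX0..CBOX9) and only then form the rates and ratios, which is what the SUM() notation in the formulas abbreviates. A minimal Go sketch of that aggregation (the per-box values are hypothetical):

package main

import "fmt"

func sum(xs []float64) float64 {
	t := 0.0
	for _, x := range xs {
		t += x
	}
	return t
}

func main() {
	hits := []float64{9e6, 8e6, 9e6, 7e6, 8e6, 9e6, 8e6, 7e6, 9e6, 8e6}   // LLC_HITS_ALL per CBOX (hypothetical)
	misses := []float64{1e6, 2e6, 1e6, 1e6, 2e6, 1e6, 1e6, 2e6, 1e6, 1e6} // LLC_MISSES_ALL per CBOX (hypothetical)
	instr := 4.0e9                                                        // INSTR_RETIRED_ANY (hypothetical)

	requests := sum(hits) + sum(misses)
	fmt.Printf("L3 request rate: %.4f\n", requests/instr)
	fmt.Printf("L3 miss rate:    %.4f\n", sum(misses)/instr)
	fmt.Printf("L3 miss ratio:   %.4f\n", sum(misses)/requests)
}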
+ + diff --git a/collectors/likwid/groups/westmereEX/MEM.txt b/collectors/likwid/groups/westmereEX/MEM.txt new file mode 100644 index 0000000..5d4fc62 --- /dev/null +++ b/collectors/likwid/groups/westmereEX/MEM.txt @@ -0,0 +1,38 @@ +SHORT Main memory bandwidth in MBytes/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +MBOX0C0 FVC_EV0_BBOX_CMDS_READS +MBOX0C1 DRAM_CMD_CAS_WR_OPN +MBOX0C2 DRAM_MISC_CAS_WR_CLS +MBOX1C0 FVC_EV0_BBOX_CMDS_READS +MBOX1C1 DRAM_CMD_CAS_WR_OPN +MBOX1C2 DRAM_MISC_CAS_WR_CLS + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX0C2+MBOX1C1+MBOX1C2)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX0C2+MBOX1C1+MBOX1C2)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1+MBOX0C2+MBOX1C2)*64/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1+MBOX0C2+MBOX1C2)*64 + +LONG +Formulas: +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time +Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1)+SUM(MBOXxC2))*64.0/time +Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1)+SUM(MBOXxC2))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1)+SUM(MBOXxC2))*64.0/time +Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1)+SUM(MBOXxC2))*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +In addition to the bandwidth it also outputs the data volume. + diff --git a/collectors/likwid/groups/westmereEX/NUMA.txt b/collectors/likwid/groups/westmereEX/NUMA.txt new file mode 100644 index 0000000..41fbe62 --- /dev/null +++ b/collectors/likwid/groups/westmereEX/NUMA.txt @@ -0,0 +1,33 @@ +SHORT Local and remote memory accesses + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 OFFCORE_RESPONSE_0_LOCAL_DRAM +PMC1 OFFCORE_RESPONSE_1_REMOTE_DRAM + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Local DRAM data volume [GByte] 1.E-09*PMC0*64 +Local DRAM bandwidth [MByte/s] 1.E-06*(PMC0*64)/time +Remote DRAM data volume [GByte] 1.E-09*PMC1*64 +Remote DRAM bandwidth [MByte/s] 1.E-06*(PMC1*64)/time +Memory data volume [GByte] 1.E-09*(PMC0+PMC1)*64 +Memory bandwidth [MByte/s] 1.E-06*((PMC0+PMC1)*64)/time + +LONG +Formulas: +CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY +Local DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_0_LOCAL_DRAM*64 +Local DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_0_LOCAL_DRAM*64)/time +Remote DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_1_REMOTE_DRAM*64 +Remote DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_1_REMOTE_DRAM*64)/time +Memory data volume [GByte] = 1.E-09*(OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64 +Memory bandwidth [MByte/s] = 1.E-06*((OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64)/time +- +This performance group measures the data traffic of CPU cores to local and remote +memory.
diff --git a/collectors/likwid/groups/westmereEX/TLB_DATA.txt b/collectors/likwid/groups/westmereEX/TLB_DATA.txt new file mode 100644 index 0000000..d256b8c --- /dev/null +++ b/collectors/likwid/groups/westmereEX/TLB_DATA.txt @@ -0,0 +1,35 @@ +SHORT L2 data TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 DTLB_LOAD_MISSES_ANY +PMC1 DTLB_MISSES_ANY +PMC2 DTLB_LOAD_MISSES_WALK_CYCLES +PMC3 DTLB_MISSES_WALK_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 DTLB load misses PMC0 +L1 DTLB load miss rate PMC0/FIXC0 +L1 DTLB load miss duration [Cyc] PMC2/PMC0 +L1 DTLB store misses (PMC1-PMC0) +L1 DTLB store miss rate (PMC1-PMC0)/FIXC0 +L1 DTLB store miss duration [Cyc] (PMC3-PMC2)/(PMC1-PMC0) + +LONG +Formulas: +L1 DTLB load misses = DTLB_LOAD_MISSES_ANY +L1 DTLB load miss rate = DTLB_LOAD_MISSES_ANY / INSTR_RETIRED_ANY +L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_CYCLES / DTLB_LOAD_MISSES_ANY +L1 DTLB store misses = DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY +L1 DTLB store miss rate = (DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY) / INSTR_RETIRED_ANY +L1 DTLB store miss duration [Cyc] = (DTLB_MISSES_WALK_CYCLES-DTLB_LOAD_MISSES_WALK_CYCLES) / (DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY) +- +The DTLB miss rate gives a measure how often a TLB miss occurred +per instruction. The store miss values are derived as the difference of the ALL and LOADS counts (ALL-LOADS). + diff --git a/collectors/likwid/groups/westmereEX/TLB_INSTR.txt b/collectors/likwid/groups/westmereEX/TLB_INSTR.txt new file mode 100644 index 0000000..2f0f90c --- /dev/null +++ b/collectors/likwid/groups/westmereEX/TLB_INSTR.txt @@ -0,0 +1,27 @@ +SHORT L1 Instruction TLB miss rate/ratio + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 ITLB_MISSES_ANY +PMC1 ITLB_MISSES_WALK_CYCLES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +L1 ITLB misses PMC0 +L1 ITLB miss rate PMC0/FIXC0 +L1 ITLB miss duration [Cyc] PMC1/PMC0 + +LONG +Formulas: +L1 ITLB misses = ITLB_MISSES_ANY +L1 ITLB miss rate = ITLB_MISSES_ANY / INSTR_RETIRED_ANY +L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_CYCLES / ITLB_MISSES_ANY +- +The ITLB miss rate gives a measure how often a TLB miss occurred +per instruction. The duration measures how many cycles a page-table walk took. + diff --git a/collectors/likwid/groups/westmereEX/UOPS.txt b/collectors/likwid/groups/westmereEX/UOPS.txt new file mode 100644 index 0000000..f58fda6 --- /dev/null +++ b/collectors/likwid/groups/westmereEX/UOPS.txt @@ -0,0 +1,32 @@ +SHORT UOPs execution info + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +PMC0 UOPS_ISSUED_ANY +PMC2 UOPS_RETIRED_ANY +PMC3 UOPS_ISSUED_FUSED + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Issued UOPs PMC0 +Merged UOPs PMC3 +Retired UOPs PMC2 + +LONG +Formulas: +Issued UOPs = UOPS_ISSUED_ANY +Merged UOPs = UOPS_ISSUED_FUSED +Retired UOPs = UOPS_RETIRED_ANY +- +This group returns information about the instruction pipeline. It measures the +issued, executed and retired uOPs and returns the number of uOPs which were issued +but not executed as well as the number of uOPs which were executed but never retired.
+The executed but not retired uOPs commonly come from speculatively executed branches. + diff --git a/collectors/likwid/groups/zen/BRANCH.txt b/collectors/likwid/groups/zen/BRANCH.txt new file mode 100644 index 0000000..dbaf07f --- /dev/null +++ b/collectors/likwid/groups/zen/BRANCH.txt @@ -0,0 +1,32 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_BRANCH_INSTR +PMC3 RETIRED_MISP_BRANCH_INSTR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +Branch rate PMC2/PMC0 +Branch misprediction rate PMC3/PMC0 +Branch misprediction ratio PMC3/PMC2 +Instructions per branch PMC0/PMC2 + +LONG +Formulas: +Branch rate = RETIRED_BRANCH_INSTR/RETIRED_INSTRUCTIONS +Branch misprediction rate = RETIRED_MISP_BRANCH_INSTR/RETIRED_INSTRUCTIONS +Branch misprediction ratio = RETIRED_MISP_BRANCH_INSTR/RETIRED_BRANCH_INSTR +Instructions per branch = RETIRED_INSTRUCTIONS/RETIRED_BRANCH_INSTR +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio directly states +what share of all branch instructions was mispredicted. +Instructions per branch is 1/branch rate. + diff --git a/collectors/likwid/groups/zen/CACHE.txt b/collectors/likwid/groups/zen/CACHE.txt new file mode 100644 index 0000000..b773e5a --- /dev/null +++ b/collectors/likwid/groups/zen/CACHE.txt @@ -0,0 +1,39 @@ +SHORT Data cache miss rate/ratio + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 DATA_CACHE_ACCESSES +PMC3 DATA_CACHE_REFILLS_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +data cache requests PMC2 +data cache request rate PMC2/PMC0 +data cache misses PMC3 +data cache miss rate PMC3/PMC0 +data cache miss ratio PMC3/PMC2 + +LONG +Formulas: +data cache requests = DATA_CACHE_ACCESSES +data cache request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS +data cache misses = DATA_CACHE_REFILLS_ALL +data cache miss rate = DATA_CACHE_REFILLS_ALL / RETIRED_INSTRUCTIONS +data cache miss ratio = DATA_CACHE_REFILLS_ALL / DATA_CACHE_ACCESSES +- +This group measures the locality of your data accesses with regard to the +L1 cache. Data cache request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The data cache miss rate gives a measure how often it was necessary to get +cache lines from higher levels of the memory hierarchy. And finally +data cache miss ratio tells you how many of your memory references required +a cache line to be loaded from a higher level. While the data cache miss rate +might be given by your algorithm you should try to get data cache miss ratio +as low as possible by increasing your cache reuse.
+ diff --git a/collectors/likwid/groups/zen/CPI.txt b/collectors/likwid/groups/zen/CPI.txt new file mode 100644 index 0000000..23e4f8c --- /dev/null +++ b/collectors/likwid/groups/zen/CPI.txt @@ -0,0 +1,30 @@ +SHORT Cycles per instruction + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_UOPS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +CPI (based on uops) PMC1/PMC2 +IPC PMC0/PMC1 + + +LONG +Formulas: +CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS +CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS +IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED +- +This group measures how efficiently the processor works with +regard to instruction throughput. Also important as a standalone +metric is RETIRED_INSTRUCTIONS as it tells you how many instructions +you need to execute for a task. An optimization might show very +low CPI values but execute many more instructions for it. + diff --git a/collectors/likwid/groups/zen/DATA.txt b/collectors/likwid/groups/zen/DATA.txt new file mode 100644 index 0000000..e061b90 --- /dev/null +++ b/collectors/likwid/groups/zen/DATA.txt @@ -0,0 +1,23 @@ +SHORT Load to store ratio + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 LS_DISPATCH_LOADS +PMC3 LS_DISPATCH_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +Load to store ratio PMC2/PMC3 + +LONG +Formulas: +Load to store ratio = LS_DISPATCH_LOADS/LS_DISPATCH_STORES +- +This is a simple metric to determine your load to store ratio. + diff --git a/collectors/likwid/groups/zen/DIVIDE.txt b/collectors/likwid/groups/zen/DIVIDE.txt new file mode 100644 index 0000000..c98500b --- /dev/null +++ b/collectors/likwid/groups/zen/DIVIDE.txt @@ -0,0 +1,26 @@ +SHORT Divide unit information + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 DIV_OP_COUNT +PMC3 DIV_BUSY_CYCLES + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +Number of divide ops PMC2 +Avg. divide unit usage duration PMC3/PMC2 + +LONG +Formulas: +CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS +Number of divide ops = DIV_OP_COUNT +Avg. divide unit usage duration = DIV_BUSY_CYCLES/DIV_OP_COUNT +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/zen/ENERGY.txt b/collectors/likwid/groups/zen/ENERGY.txt new file mode 100644 index 0000000..f58c5b1 --- /dev/null +++ b/collectors/likwid/groups/zen/ENERGY.txt @@ -0,0 +1,32 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PWR0 RAPL_CORE_ENERGY +PWR1 RAPL_PKG_ENERGY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +Energy Core [J] PWR0 +Power Core [W] PWR0/time +Energy PKG [J] PWR1 +Power PKG [W] PWR1/time + +LONG +Formulas: +Power Core [W] = RAPL_CORE_ENERGY/time +Power PKG [W] = RAPL_PKG_ENERGY/time +- +Ryzen implements the RAPL interface previously introduced by Intel. +This interface makes it possible to monitor the energy consumed in the core and package +domains.
+It is not documented by AMD which parts of the CPU are in which domain. + diff --git a/collectors/likwid/groups/zen/FLOPS_DP.txt b/collectors/likwid/groups/zen/FLOPS_DP.txt new file mode 100644 index 0000000..420f942 --- /dev/null +++ b/collectors/likwid/groups/zen/FLOPS_DP.txt @@ -0,0 +1,26 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_SSE_AVX_FLOPS_DOUBLE_ALL +PMC3 MERGE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +DP [MFLOP/s] 1.0E-06*(PMC2)/time + +LONG +Formulas: +CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS +DP [MFLOP/s] = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_DOUBLE_ALL)/time +- +Profiling group to measure the double-precision FLOP rate. The event might +have a higher per-cycle increment than 15, so the MERGE event is required. + + diff --git a/collectors/likwid/groups/zen/FLOPS_SP.txt b/collectors/likwid/groups/zen/FLOPS_SP.txt new file mode 100644 index 0000000..1f64af1 --- /dev/null +++ b/collectors/likwid/groups/zen/FLOPS_SP.txt @@ -0,0 +1,26 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_SSE_AVX_FLOPS_SINGLE_ALL +PMC3 MERGE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +SP [MFLOP/s] 1.0E-06*(PMC2)/time + +LONG +Formulas: +CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS +SP [MFLOP/s] = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_SINGLE_ALL)/time +- +Profiling group to measure the single-precision FLOP rate. The event might +have a higher per-cycle increment than 15, so the MERGE event is required. + + diff --git a/collectors/likwid/groups/zen/ICACHE.txt b/collectors/likwid/groups/zen/ICACHE.txt new file mode 100644 index 0000000..f98c28a --- /dev/null +++ b/collectors/likwid/groups/zen/ICACHE.txt @@ -0,0 +1,28 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 ICACHE_FETCHES +PMC2 ICACHE_L2_REFILLS +PMC3 ICACHE_SYSTEM_REFILLS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/PMC0 +L1I request rate PMC1/PMC0 +L1I miss rate (PMC2+PMC3)/PMC0 +L1I miss ratio (PMC2+PMC3)/PMC1 + +LONG +Formulas: +L1I request rate = ICACHE_FETCHES / RETIRED_INSTRUCTIONS +L1I miss rate = (ICACHE_L2_REFILLS + ICACHE_SYSTEM_REFILLS)/RETIRED_INSTRUCTIONS +L1I miss ratio = (ICACHE_L2_REFILLS + ICACHE_SYSTEM_REFILLS)/ICACHE_FETCHES +- +This group measures the locality of your instruction code with regard to the +L1 I-Cache.
+ diff --git a/collectors/likwid/groups/zen/L2.txt b/collectors/likwid/groups/zen/L2.txt new file mode 100644 index 0000000..420e34d --- /dev/null +++ b/collectors/likwid/groups/zen/L2.txt @@ -0,0 +1,28 @@ +SHORT L2 cache bandwidth in MBytes/s (experimental) + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC3 REQUESTS_TO_L2_GRP1_ALL_NO_PF + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC3*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC3)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC3)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*REQUESTS_TO_L2_GRP1_ALL_NO_PF*64.0/time +L2D load data volume [GBytes] = 1.0E-09*REQUESTS_TO_L2_GRP1_ALL_NO_PF*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(REQUESTS_TO_L2_GRP1_ALL_NO_PF)*64/time +L2 data volume [GBytes] = 1.0E-09*(REQUESTS_TO_L2_GRP1_ALL_NO_PF)*64 +- +Profiling group to measure L2 cache bandwidth. There is no way to measure +the store traffic between L1 and L2. diff --git a/collectors/likwid/groups/zen/L3.txt b/collectors/likwid/groups/zen/L3.txt new file mode 100644 index 0000000..6fe808a --- /dev/null +++ b/collectors/likwid/groups/zen/L3.txt @@ -0,0 +1,32 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +CPMC0 L3_ACCESS +CPMC1 L3_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +L3 access bandwidth [MBytes/s] 1.0E-06*CPMC0*64.0/time +L3 access data volume [GBytes] 1.0E-09*CPMC0*64.0 +L3 access rate [%] (CPMC0/PMC0)*100.0 +L3 miss rate [%] (CPMC1/PMC0)*100.0 +L3 miss ratio [%] (CPMC1/CPMC0)*100.0 + +LONG +Formulas: +L3 access bandwidth [MBytes/s] = 1.0E-06*L3_ACCESS*64.0/time +L3 access data volume [GBytes] = 1.0E-09*L3_ACCESS*64.0 +L3 access rate [%] = (L3_ACCESS/RETIRED_INSTRUCTIONS)*100 +L3 miss rate [%] = (L3_MISS/RETIRED_INSTRUCTIONS)*100 +L3 miss ratio [%]= (L3_MISS/L3_ACCESS)*100 +- +Profiling group to measure L3 cache bandwidth. There is no way to measure +the store traffic between L2 and L3. The only two published L3 events are +L3_ACCESS and L3_MISS. diff --git a/collectors/likwid/groups/zen/MEM.txt b/collectors/likwid/groups/zen/MEM.txt new file mode 100644 index 0000000..36ff58f --- /dev/null +++ b/collectors/likwid/groups/zen/MEM.txt @@ -0,0 +1,32 @@ +SHORT Main memory bandwidth in MBytes/s (experimental) + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +DFC0 DATA_FROM_LOCAL_DRAM_CHANNEL +DFC1 DATA_TO_LOCAL_DRAM_CHANNEL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +Memory bandwidth [MBytes/s] 1.0E-06*(DFC0+DFC1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(DFC0+DFC1)*64.0 + +LONG +Formulas: +Memory bandwidth [MBytes/s] = 1.0E-06*(DATA_FROM_LOCAL_DRAM_CHANNEL+DATA_TO_LOCAL_DRAM_CHANNEL)*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(DATA_FROM_LOCAL_DRAM_CHANNEL+DATA_TO_LOCAL_DRAM_CHANNEL)*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on a +per socket base. 
+The group provides almost accurate results for the total bandwidth and data volume. +AMD describes this metric as "approximate" in the documentation for AMD Rome. + +Be aware that although the events imply a traffic direction (FROM and TO), the events +cannot be used to differentiate between read and write traffic. The events will be +renamed to avoid that confusion in the future. diff --git a/collectors/likwid/groups/zen/MEM_DP.txt b/collectors/likwid/groups/zen/MEM_DP.txt new file mode 100644 index 0000000..9c2fe38 --- /dev/null +++ b/collectors/likwid/groups/zen/MEM_DP.txt @@ -0,0 +1,39 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_SSE_AVX_FLOPS_DOUBLE_ALL +PMC3 MERGE +DFC0 DATA_FROM_LOCAL_DRAM_CHANNEL +DFC1 DATA_TO_LOCAL_DRAM_CHANNEL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +DP [MFLOP/s] 1.0E-06*(PMC2)/time +Memory bandwidth [MBytes/s] 1.0E-06*(DFC0+DFC1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(DFC0+DFC1)*64.0 +Operational intensity PMC2/((DFC0+DFC1)*64.0) + +LONG +Formulas: +DP [MFLOP/s] = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_DOUBLE_ALL)/time +Memory bandwidth [MBytes/s] = 1.0E-06*(DATA_FROM_LOCAL_DRAM_CHANNEL+DATA_TO_LOCAL_DRAM_CHANNEL)*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(DATA_FROM_LOCAL_DRAM_CHANNEL+DATA_TO_LOCAL_DRAM_CHANNEL)*64.0 +Operational intensity = RETIRED_SSE_AVX_FLOPS_DOUBLE_ALL/((DATA_FROM_LOCAL_DRAM_CHANNEL+DATA_TO_LOCAL_DRAM_CHANNEL)*64.0) +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on a +per socket base. +The group provides almost accurate results for the total bandwidth and data volume. +AMD describes this metric as "approximate" in the documentation for AMD Rome. + +Be aware that although the events imply a traffic direction (FROM and TO), the events +cannot be used to differentiate between read and write traffic. The events will be +renamed to avoid that confusion in the future. + diff --git a/collectors/likwid/groups/zen/MEM_SP.txt b/collectors/likwid/groups/zen/MEM_SP.txt new file mode 100644 index 0000000..48ce75c --- /dev/null +++ b/collectors/likwid/groups/zen/MEM_SP.txt @@ -0,0 +1,39 @@ +SHORT Overview of arithmetic and main memory performance + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_SSE_AVX_FLOPS_SINGLE_ALL +PMC3 MERGE +DFC0 DATA_FROM_LOCAL_DRAM_CHANNEL +DFC1 DATA_TO_LOCAL_DRAM_CHANNEL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +SP [MFLOP/s] 1.0E-06*(PMC2)/time +Memory bandwidth [MBytes/s] 1.0E-06*(DFC0+DFC1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(DFC0+DFC1)*64.0 +Operational intensity PMC2/((DFC0+DFC1)*64.0) + +LONG +Formulas: +SP [MFLOP/s] = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_SINGLE_ALL)/time +Memory bandwidth [MBytes/s] = 1.0E-06*(DATA_FROM_LOCAL_DRAM_CHANNEL+DATA_TO_LOCAL_DRAM_CHANNEL)*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(DATA_FROM_LOCAL_DRAM_CHANNEL+DATA_TO_LOCAL_DRAM_CHANNEL)*64.0 +Operational intensity = RETIRED_SSE_AVX_FLOPS_SINGLE_ALL/((DATA_FROM_LOCAL_DRAM_CHANNEL+DATA_TO_LOCAL_DRAM_CHANNEL)*64.0) +- +Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on Uncore events it is only possible to measure on a +per socket base. +The group provides almost accurate results for the total bandwidth and data volume. +AMD describes this metric as "approximate" in the documentation for AMD Rome. + +Be aware that although the events imply a traffic direction (FROM and TO), the events +cannot be used to differentiate between read and write traffic. The events will be +renamed to avoid that confusion in the future. + diff --git a/collectors/likwid/groups/zen/NUMA.txt b/collectors/likwid/groups/zen/NUMA.txt new file mode 100644 index 0000000..19ccdc1 --- /dev/null +++ b/collectors/likwid/groups/zen/NUMA.txt @@ -0,0 +1,35 @@ +SHORT Local and remote memory accesses (experimental) + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 DATA_CACHE_REFILLS_LOCAL_ALL +PMC1 DATA_CACHE_REFILLS_REMOTE_ALL +PMC2 HWPREF_DATA_CACHE_FILLS_LOCAL_ALL +PMC3 HWPREF_DATA_CACHE_FILLS_REMOTE_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +Local bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC2)*64.0/time +Local data volume [GBytes] 1.0E-09*(PMC0+PMC2)*64.0 +Remote bandwidth [MBytes/s] 1.0E-06*(PMC1+PMC3)*64.0/time +Remote data volume [GBytes] 1.0E-09*(PMC1+PMC3)*64.0 +Total bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC2+PMC1+PMC3)*64.0/time +Total data volume [GBytes] 1.0E-09*(PMC0+PMC2+PMC1+PMC3)*64.0 + +LONG +Formulas: +Local bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_LOCAL_ALL+HWPREF_DATA_CACHE_FILLS_LOCAL_ALL)*64.0/time +Local data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_LOCAL_ALL+HWPREF_DATA_CACHE_FILLS_LOCAL_ALL)*64.0 +Remote bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_REMOTE_ALL+HWPREF_DATA_CACHE_FILLS_REMOTE_ALL)*64.0/time +Remote data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_REMOTE_ALL+HWPREF_DATA_CACHE_FILLS_REMOTE_ALL)*64.0 +Total bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_LOCAL_ALL+HWPREF_DATA_CACHE_FILLS_LOCAL_ALL+DATA_CACHE_REFILLS_REMOTE_ALL+HWPREF_DATA_CACHE_FILLS_REMOTE_ALL)*64.0/time +Total data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_LOCAL_ALL+HWPREF_DATA_CACHE_FILLS_LOCAL_ALL+DATA_CACHE_REFILLS_REMOTE_ALL+HWPREF_DATA_CACHE_FILLS_REMOTE_ALL)*64.0 +- +Profiling group to measure NUMA traffic. The data sources range from +local L2, CCX and memory for the local metrics and remote CCX and memory +for the remote metrics. There are also events that measure the software +prefetches from local and remote domain but AMD Zen provides only 4 counters.
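The NUMA group adds demand refills and hardware-prefetch fills per domain before converting to bandwidth, which makes the local/remote traffic split easy to compute. A minimal Go sketch of that derivation (not part of the patch; counter values are hypothetical):

package main

import "fmt"

func main() {
	var (
		refillsLocal  = 2.0e8 // DATA_CACHE_REFILLS_LOCAL_ALL (hypothetical)
		refillsRemote = 4.0e7 // DATA_CACHE_REFILLS_REMOTE_ALL (hypothetical)
		prefLocal     = 6.0e7 // HWPREF_DATA_CACHE_FILLS_LOCAL_ALL (hypothetical)
		prefRemote    = 1.0e7 // HWPREF_DATA_CACHE_FILLS_REMOTE_ALL (hypothetical)
		seconds       = 1.0
	)
	// Demand refills and prefetch fills are summed per domain; each event
	// moves one 64-byte cache line.
	local := 1.0e-6 * (refillsLocal + prefLocal) * 64.0 / seconds
	remote := 1.0e-6 * (refillsRemote + prefRemote) * 64.0 / seconds
	fmt.Printf("Local bandwidth  [MBytes/s]: %.2f\n", local)
	fmt.Printf("Remote bandwidth [MBytes/s]: %.2f\n", remote)
	fmt.Printf("Remote share of traffic: %.1f%%\n", 100.0*remote/(local+remote))
}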
diff --git a/collectors/likwid/groups/zen/TLB.txt b/collectors/likwid/groups/zen/TLB.txt new file mode 100644 index 0000000..510284b --- /dev/null +++ b/collectors/likwid/groups/zen/TLB.txt @@ -0,0 +1,39 @@ +SHORT TLB miss rate/ratio + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 DATA_CACHE_ACCESSES +PMC2 L1_DTLB_MISS_ANY_L2_HIT +PMC3 L1_DTLB_MISS_ANY_L2_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/PMC0 +L1 DTLB request rate PMC1/PMC0 +L1 DTLB miss rate (PMC2+PMC3)/PMC0 +L1 DTLB miss ratio (PMC2+PMC3)/PMC1 +L2 DTLB request rate (PMC2+PMC3)/PMC0 +L2 DTLB miss rate PMC3/PMC0 +L2 DTLB miss ratio PMC3/(PMC2+PMC3) + + +LONG +Formulas: +L1 DTLB request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS +L1 DTLB miss rate = (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS)/RETIRED_INSTRUCTIONS +L1 DTLB miss ratio = (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS)/DATA_CACHE_ACCESSES +L2 DTLB request rate = (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS)/RETIRED_INSTRUCTIONS +L2 DTLB miss rate = L1_DTLB_MISS_ANY_L2_MISS / RETIRED_INSTRUCTIONS +L2 DTLB miss ratio = L1_DTLB_MISS_ANY_L2_MISS / (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS) +- +L1 DTLB request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The DTLB miss rate gives a measure how often a TLB miss occurred +per instruction. And finally L1 DTLB miss ratio tells you how many +of your memory references caused a TLB miss on average. +NOTE: The L2 metrics are only relevant if L2 DTLB request rate is +equal to the L1 DTLB miss rate! diff --git a/collectors/likwid/groups/zen2/BRANCH.txt b/collectors/likwid/groups/zen2/BRANCH.txt new file mode 100644 index 0000000..dbaf07f --- /dev/null +++ b/collectors/likwid/groups/zen2/BRANCH.txt @@ -0,0 +1,32 @@ +SHORT Branch prediction miss rate/ratio + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_BRANCH_INSTR +PMC3 RETIRED_MISP_BRANCH_INSTR + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +Branch rate PMC2/PMC0 +Branch misprediction rate PMC3/PMC0 +Branch misprediction ratio PMC3/PMC2 +Instructions per branch PMC0/PMC2 + +LONG +Formulas: +Branch rate = RETIRED_BRANCH_INSTR/RETIRED_INSTRUCTIONS +Branch misprediction rate = RETIRED_MISP_BRANCH_INSTR/RETIRED_INSTRUCTIONS +Branch misprediction ratio = RETIRED_MISP_BRANCH_INSTR/RETIRED_BRANCH_INSTR +Instructions per branch = RETIRED_INSTRUCTIONS/RETIRED_BRANCH_INSTR +- +The rates state how often on average a branch or a mispredicted branch occurred +per instruction retired in total. The branch misprediction ratio directly states +what share of all branch instructions was mispredicted. +Instructions per branch is 1/branch rate.
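The BRANCH group arithmetic is small enough to show end to end; the sketch below (not part of the patch; counter values are made up) also demonstrates the identity stated in the description, that instructions per branch is the inverse of the branch rate:

package main

import "fmt"

func main() {
	var (
		instructions = 1.0e9 // RETIRED_INSTRUCTIONS (hypothetical)
		branches     = 2.0e8 // RETIRED_BRANCH_INSTR (hypothetical)
		mispredicted = 4.0e6 // RETIRED_MISP_BRANCH_INSTR (hypothetical)
	)
	branchRate := branches / instructions
	fmt.Printf("Branch rate:                %.3f\n", branchRate)
	fmt.Printf("Branch misprediction rate:  %.5f\n", mispredicted/instructions)
	fmt.Printf("Branch misprediction ratio: %.3f\n", mispredicted/branches)
	// Instructions per branch is 1/branch rate.
	fmt.Printf("Instructions per branch:    %.1f\n", 1.0/branchRate)
}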
+ diff --git a/collectors/likwid/groups/zen2/CACHE.txt b/collectors/likwid/groups/zen2/CACHE.txt new file mode 100644 index 0000000..b773e5a --- /dev/null +++ b/collectors/likwid/groups/zen2/CACHE.txt @@ -0,0 +1,39 @@ +SHORT Data cache miss rate/ratio + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 DATA_CACHE_ACCESSES +PMC3 DATA_CACHE_REFILLS_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +data cache requests PMC2 +data cache request rate PMC2/PMC0 +data cache misses PMC3 +data cache miss rate PMC3/PMC0 +data cache miss ratio PMC3/PMC2 + +LONG +Formulas: +data cache requests = DATA_CACHE_ACCESSES +data cache request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS +data cache misses = DATA_CACHE_REFILLS_ALL +data cache miss rate = DATA_CACHE_REFILLS_ALL / RETIRED_INSTRUCTIONS +data cache miss ratio = DATA_CACHE_REFILLS_ALL / DATA_CACHE_ACCESSES +- +This group measures the locality of your data accesses with regard to the +L1 cache. Data cache request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. +The data cache miss rate gives a measure how often it was necessary to get +cache lines from higher levels of the memory hierarchy. And finally +data cache miss ratio tells you how many of your memory references required +a cache line to be loaded from a higher level. While the data cache miss rate +might be given by your algorithm you should try to get data cache miss ratio +as low as possible by increasing your cache reuse. + diff --git a/collectors/likwid/groups/zen2/CPI.txt b/collectors/likwid/groups/zen2/CPI.txt new file mode 100644 index 0000000..23e4f8c --- /dev/null +++ b/collectors/likwid/groups/zen2/CPI.txt @@ -0,0 +1,30 @@ +SHORT Cycles per instruction + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_UOPS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] PMC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +CPI (based on uops) PMC1/PMC2 +IPC PMC0/PMC1 + + +LONG +Formulas: +CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS +CPI (based on uops) = CPU_CLOCKS_UNHALTED/RETIRED_UOPS +IPC = RETIRED_INSTRUCTIONS/CPU_CLOCKS_UNHALTED +- +This group measures how efficiently the processor works with +regard to instruction throughput. Also important as a standalone +metric is RETIRED_INSTRUCTIONS as it tells you how many instructions +you need to execute for a task. An optimization might show very +low CPI values but execute many more instructions for it. + diff --git a/collectors/likwid/groups/zen2/DATA.txt b/collectors/likwid/groups/zen2/DATA.txt new file mode 100644 index 0000000..e061b90 --- /dev/null +++ b/collectors/likwid/groups/zen2/DATA.txt @@ -0,0 +1,23 @@ +SHORT Load to store ratio + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 LS_DISPATCH_LOADS +PMC3 LS_DISPATCH_STORES + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +Load to store ratio PMC2/PMC3 + +LONG +Formulas: +Load to store ratio = LS_DISPATCH_LOADS/LS_DISPATCH_STORES +- +This is a simple metric to determine your load to store ratio.
+ diff --git a/collectors/likwid/groups/zen2/DIVIDE.txt b/collectors/likwid/groups/zen2/DIVIDE.txt new file mode 100644 index 0000000..13d629b --- /dev/null +++ b/collectors/likwid/groups/zen2/DIVIDE.txt @@ -0,0 +1,25 @@ +SHORT Divide unit information + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 DIV_OP_COUNT +PMC3 DIV_BUSY_CYCLES + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +Number of divide ops PMC2 +Avg. divide unit usage duration PMC3/PMC2 + +LONG +Formulas: +Number of divide ops = DIV_OP_COUNT +Avg. divide unit usage duration = DIV_BUSY_CYCLES/DIV_OP_COUNT +- +This performance group measures the average latency of divide operations. diff --git a/collectors/likwid/groups/zen2/ENERGY.txt b/collectors/likwid/groups/zen2/ENERGY.txt new file mode 100644 index 0000000..f58c5b1 --- /dev/null +++ b/collectors/likwid/groups/zen2/ENERGY.txt @@ -0,0 +1,32 @@ +SHORT Power and Energy consumption + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PWR0 RAPL_CORE_ENERGY +PWR1 RAPL_PKG_ENERGY + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +Energy Core [J] PWR0 +Power Core [W] PWR0/time +Energy PKG [J] PWR1 +Power PKG [W] PWR1/time + +LONG +Formulas: +Power Core [W] = RAPL_CORE_ENERGY/time +Power PKG [W] = RAPL_PKG_ENERGY/time +- +Ryzen implements the RAPL interface previously introduced by Intel. +This interface makes it possible to monitor the energy consumed in the core and package +domains. +It is not documented by AMD which parts of the CPU are in which domain. + diff --git a/collectors/likwid/groups/zen2/FLOPS_DP.txt b/collectors/likwid/groups/zen2/FLOPS_DP.txt new file mode 100644 index 0000000..740acb9 --- /dev/null +++ b/collectors/likwid/groups/zen2/FLOPS_DP.txt @@ -0,0 +1,28 @@ +SHORT Double Precision MFLOP/s + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_SSE_AVX_FLOPS_ALL +PMC3 MERGE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +DP [MFLOP/s] 1.0E-06*(PMC2)/time + +LONG +Formulas: +CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS +DP [MFLOP/s] = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_ALL)/time +- +Profiling group to measure (double-precision) FLOP rate. The event might +have a higher per-cycle increment than 15, so the MERGE event is required. In +contrast to AMD Zen, the Zen2 microarchitecture does not provide events to +differentiate between single- and double-precision.
+ + diff --git a/collectors/likwid/groups/zen2/FLOPS_SP.txt b/collectors/likwid/groups/zen2/FLOPS_SP.txt new file mode 100644 index 0000000..0d25aeb --- /dev/null +++ b/collectors/likwid/groups/zen2/FLOPS_SP.txt @@ -0,0 +1,28 @@ +SHORT Single Precision MFLOP/s + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC2 RETIRED_SSE_AVX_FLOPS_ALL +PMC3 MERGE + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +SP [MFLOP/s] 1.0E-06*(PMC2)/time + +LONG +Formulas: +CPI = CPU_CLOCKS_UNHALTED/RETIRED_INSTRUCTIONS +SP [MFLOP/s] = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_ALL)/time +- +Profiling group to measure (single-precision) FLOP rate. The event might +have a higher per-cycle increment than 15, so the MERGE event is required. In +contrast to AMD Zen, the Zen2 microarchitecture does not provide events to +differentiate between single- and double-precision. + + diff --git a/collectors/likwid/groups/zen2/ICACHE.txt b/collectors/likwid/groups/zen2/ICACHE.txt new file mode 100644 index 0000000..f98c28a --- /dev/null +++ b/collectors/likwid/groups/zen2/ICACHE.txt @@ -0,0 +1,28 @@ +SHORT Instruction cache miss rate/ratio + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 ICACHE_FETCHES +PMC2 ICACHE_L2_REFILLS +PMC3 ICACHE_SYSTEM_REFILLS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/PMC0 +L1I request rate PMC1/PMC0 +L1I miss rate (PMC2+PMC3)/PMC0 +L1I miss ratio (PMC2+PMC3)/PMC1 + +LONG +Formulas: +L1I request rate = ICACHE_FETCHES / RETIRED_INSTRUCTIONS +L1I miss rate = (ICACHE_L2_REFILLS + ICACHE_SYSTEM_REFILLS)/RETIRED_INSTRUCTIONS +L1I miss ratio = (ICACHE_L2_REFILLS + ICACHE_SYSTEM_REFILLS)/ICACHE_FETCHES +- +This group measures the locality of your instruction code with regard to the +L1 I-Cache. + diff --git a/collectors/likwid/groups/zen2/L2.txt b/collectors/likwid/groups/zen2/L2.txt new file mode 100644 index 0000000..420e34d --- /dev/null +++ b/collectors/likwid/groups/zen2/L2.txt @@ -0,0 +1,28 @@ +SHORT L2 cache bandwidth in MBytes/s (experimental) + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +PMC3 REQUESTS_TO_L2_GRP1_ALL_NO_PF + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +L2D load bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time +L2D load data volume [GBytes] 1.0E-09*PMC3*64.0 +L2 bandwidth [MBytes/s] 1.0E-06*(PMC3)*64.0/time +L2 data volume [GBytes] 1.0E-09*(PMC3)*64.0 + +LONG +Formulas: +L2D load bandwidth [MBytes/s] = 1.0E-06*REQUESTS_TO_L2_GRP1_ALL_NO_PF*64.0/time +L2D load data volume [GBytes] = 1.0E-09*REQUESTS_TO_L2_GRP1_ALL_NO_PF*64.0 +L2 bandwidth [MBytes/s] = 1.0E-06*(REQUESTS_TO_L2_GRP1_ALL_NO_PF)*64/time +L2 data volume [GBytes] = 1.0E-09*(REQUESTS_TO_L2_GRP1_ALL_NO_PF)*64 +- +Profiling group to measure L2 cache bandwidth. There is no way to measure +the store traffic between L1 and L2.
diff --git a/collectors/likwid/groups/zen2/L3.txt b/collectors/likwid/groups/zen2/L3.txt new file mode 100644 index 0000000..6fe808a --- /dev/null +++ b/collectors/likwid/groups/zen2/L3.txt @@ -0,0 +1,32 @@ +SHORT L3 cache bandwidth in MBytes/s + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +CPMC0 L3_ACCESS +CPMC1 L3_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +L3 access bandwidth [MBytes/s] 1.0E-06*CPMC0*64.0/time +L3 access data volume [GBytes] 1.0E-09*CPMC0*64.0 +L3 access rate [%] (CPMC0/PMC0)*100.0 +L3 miss rate [%] (CPMC1/PMC0)*100.0 +L3 miss ratio [%] (CPMC1/CPMC0)*100.0 + +LONG +Formulas: +L3 access bandwidth [MBytes/s] = 1.0E-06*L3_ACCESS*64.0/time +L3 access data volume [GBytes] = 1.0E-09*L3_ACCESS*64.0 +L3 access rate [%] = (L3_ACCESS/RETIRED_INSTRUCTIONS)*100 +L3 miss rate [%] = (L3_MISS/RETIRED_INSTRUCTIONS)*100 +L3 miss ratio [%] = (L3_MISS/L3_ACCESS)*100 +- +Profiling group to measure L3 cache bandwidth. There is no way to measure +the store traffic between L2 and L3. The only two published L3 events are +L3_ACCESS and L3_MISS. diff --git a/collectors/likwid/groups/zen2/MEM.txt b/collectors/likwid/groups/zen2/MEM.txt new file mode 100644 index 0000000..c589640 --- /dev/null +++ b/collectors/likwid/groups/zen2/MEM.txt @@ -0,0 +1,35 @@ +SHORT Main memory bandwidth in MBytes/s (experimental) + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 CPU_CLOCKS_UNHALTED +DFC0 DATA_FROM_LOCAL_DRAM_CHANNEL +DFC1 DATA_TO_LOCAL_DRAM_CHANNEL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +Memory bandwidth [MBytes/s] 1.0E-06*(DFC0+DFC1)*(4.0/num_numadomains)*64.0/time +Memory data volume [GBytes] 1.0E-09*(DFC0+DFC1)*(4.0/num_numadomains)*64.0 + +LONG +Formulas: +Memory bandwidth [MBytes/s] = 1.0E-06*(DATA_FROM_LOCAL_DRAM_CHANNEL+DATA_TO_LOCAL_DRAM_CHANNEL)*(4.0/num_numadomains)*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(DATA_FROM_LOCAL_DRAM_CHANNEL+DATA_TO_LOCAL_DRAM_CHANNEL)*(4.0/num_numadomains)*64.0 +- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on a +per socket base. +The group provides almost accurate results for the total bandwidth +and data volume. +The metric formulas contain a correction factor of (4.0/num_numadomains) to +return the value for all 4 memory controllers in NPS1 mode. This is probably +a workaround; AMD has been asked for clarification but has not answered yet. + +Be aware that although the events imply a traffic direction (FROM and TO), the events +cannot be used to differentiate between read and write traffic. The events will be +renamed to avoid that confusion in the future.
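The (4.0/num_numadomains) correction in the zen2 MEM group is easy to misread, so here is a short Go sketch of the scaling (not part of the patch; counter values are hypothetical): the DF counters only observe the local NUMA domain's DRAM channels, so in NPS1 mode the raw counts are scaled up to cover all four memory controllers, while in NPS4 the factor degenerates to 1:

package main

import "fmt"

// memBandwidthMBs applies the zen2 MEM metric formula, including the
// 4.0/num_numadomains correction factor described above.
func memBandwidthMBs(fromDram, toDram, seconds float64, numaDomains int) float64 {
	correction := 4.0 / float64(numaDomains)
	return 1.0e-6 * (fromDram + toDram) * correction * 64.0 / seconds
}

func main() {
	// NPS1: one NUMA domain per socket, counters scaled up by 4.
	fmt.Printf("NPS1: %.2f MBytes/s\n", memBandwidthMBs(5.0e8, 2.0e8, 1.0, 1))
	// NPS4: four NUMA domains per socket, factor is 1 and no scaling happens.
	fmt.Printf("NPS4: %.2f MBytes/s\n", memBandwidthMBs(5.0e8, 2.0e8, 1.0, 4))
}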
diff --git a/collectors/likwid/groups/zen2/NUMA.txt b/collectors/likwid/groups/zen2/NUMA.txt new file mode 100644 index 0000000..6cb881a --- /dev/null +++ b/collectors/likwid/groups/zen2/NUMA.txt @@ -0,0 +1,35 @@ +SHORT Local and remote memory accesses (experimental) + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 DATA_CACHE_REFILLS_LOCAL_ALL +PMC1 DATA_CACHE_REFILLS_REMOTE_ALL +PMC2 HWPREF_DATA_CACHE_FILLS_LOCAL_ALL +PMC3 HWPREF_DATA_CACHE_FILLS_REMOTE_ALL + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI PMC1/PMC0 +Local bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC2)*64.0/time +Local data volume [GBytes] 1.0E-09*(PMC0+PMC2)*64.0 +Remote bandwidth [MBytes/s] 1.0E-06*(PMC1+PMC3)*64.0/time +Remote data volume [GBytes] 1.0E-09*(PMC1+PMC3)*64.0 +Total bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC2+PMC1+PMC3)*64.0/time +Total data volume [GBytes] 1.0E-09*(PMC0+PMC2+PMC1+PMC3)*64.0 + +LONG +Formulas: +Local bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_LOCAL_ALL+HWPREF_DATA_CACHE_FILLS_LOCAL_ALL)*64.0/time +Local data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_LOCAL_ALL+HWPREF_DATA_CACHE_FILLS_LOCAL_ALL)*64.0 +Remote bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_REMOTE_ALL+HWPREF_DATA_CACHE_FILLS_REMOTE_ALL)*64.0/time +Remote data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_REMOTE_ALL+HWPREF_DATA_CACHE_FILLS_REMOTE_ALL)*64.0 +Total bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_LOCAL_ALL+HWPREF_DATA_CACHE_FILLS_LOCAL_ALL+DATA_CACHE_REFILLS_REMOTE_ALL+HWPREF_DATA_CACHE_FILLS_REMOTE_ALL)*64.0/time +Total data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_LOCAL_ALL+HWPREF_DATA_CACHE_FILLS_LOCAL_ALL+DATA_CACHE_REFILLS_REMOTE_ALL+HWPREF_DATA_CACHE_FILLS_REMOTE_ALL)*64.0 +- +Profiling group to measure NUMA traffic. The data sources range from +local L2, CCX and memory for the local metrics and remote CCX and memory +for the remote metrics. There are also events that measure the software +prefetches from local and remote domain but AMD Zen provides only 4 counters. diff --git a/collectors/likwid/groups/zen2/TLB.txt b/collectors/likwid/groups/zen2/TLB.txt new file mode 100644 index 0000000..510284b --- /dev/null +++ b/collectors/likwid/groups/zen2/TLB.txt @@ -0,0 +1,39 @@ +SHORT TLB miss rate/ratio + +EVENTSET +FIXC1 ACTUAL_CPU_CLOCK +FIXC2 MAX_CPU_CLOCK +PMC0 RETIRED_INSTRUCTIONS +PMC1 DATA_CACHE_ACCESSES +PMC2 L1_DTLB_MISS_ANY_L2_HIT +PMC3 L1_DTLB_MISS_ANY_L2_MISS + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/PMC0 +L1 DTLB request rate PMC1/PMC0 +L1 DTLB miss rate (PMC2+PMC3)/PMC0 +L1 DTLB miss ratio (PMC2+PMC3)/PMC1 +L2 DTLB request rate (PMC2+PMC3)/PMC0 +L2 DTLB miss rate PMC3/PMC0 +L2 DTLB miss ratio PMC3/(PMC2+PMC3) + + +LONG +Formulas: +L1 DTLB request rate = DATA_CACHE_ACCESSES / RETIRED_INSTRUCTIONS +L1 DTLB miss rate = (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS)/RETIRED_INSTRUCTIONS +L1 DTLB miss ratio = (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS)/DATA_CACHE_ACCESSES +L2 DTLB request rate = (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS)/RETIRED_INSTRUCTIONS +L2 DTLB miss rate = L1_DTLB_MISS_ANY_L2_MISS / RETIRED_INSTRUCTIONS +L2 DTLB miss ratio = L1_DTLB_MISS_ANY_L2_MISS / (L1_DTLB_MISS_ANY_L2_HIT+L1_DTLB_MISS_ANY_L2_MISS) +- +L1 DTLB request rate tells you how data intensive your code is +or how many data accesses you have on average per instruction. 
+The DTLB miss rate gives a measure how often a TLB miss occurred +per instruction. And finally L1 DTLB miss ratio tells you how many +of your memory references caused a TLB miss on average. +NOTE: The L2 metrics are only relevant if L2 DTLB request rate is +equal to the L1 DTLB miss rate! diff --git a/collectors/likwid/groups/zen3/.empty b/collectors/likwid/groups/zen3/.empty new file mode 100644 index 0000000..5e965d1 --- /dev/null +++ b/collectors/likwid/groups/zen3/.empty @@ -0,0 +1 @@ +There is currently no public documentation for AMD Zen3. This folder is just a placeholder for future performance groups. diff --git a/collectors/likwid/liblikwid-hwloc.a b/collectors/likwid/liblikwid-hwloc.a new file mode 100644 index 0000000..09feadd Binary files /dev/null and b/collectors/likwid/liblikwid-hwloc.a differ diff --git a/collectors/likwid/liblikwid.a b/collectors/likwid/liblikwid.a new file mode 100644 index 0000000..a3e223f Binary files /dev/null and b/collectors/likwid/liblikwid.a differ diff --git a/collectors/likwid/likwid-marker.h b/collectors/likwid/likwid-marker.h new file mode 100644 index 0000000..ebf8b89 --- /dev/null +++ b/collectors/likwid/likwid-marker.h @@ -0,0 +1,170 @@ +/* + * ======================================================================================= + * + * Filename: likwid-marker.h + * + * Description: Header File of likwid Marker API + * + * Version: + * Released: + * + * Authors: Thomas Gruber (tg), thomas.roehl@googlemail.com + * + * Project: likwid + * + * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free Software + * Foundation, either version 3 of the License, or (at your option) any later + * version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + * + * ======================================================================================= + */ +#ifndef LIKWID_MARKER_H +#define LIKWID_MARKER_H + + +/** \addtogroup MarkerAPI Marker API module +* @{ +*/ +/*! +\def LIKWID_MARKER_INIT +Shortcut for likwid_markerInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed +*/ +/*! +\def LIKWID_MARKER_THREADINIT +Shortcut for likwid_markerThreadInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed +*/ +/*! +\def LIKWID_MARKER_REGISTER(regionTag) +Shortcut for likwid_markerRegisterRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed +*/ +/*! +\def LIKWID_MARKER_START(regionTag) +Shortcut for likwid_markerStartRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed +*/ +/*! +\def LIKWID_MARKER_STOP(regionTag) +Shortcut for likwid_markerStopRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed +*/ +/*! +\def LIKWID_MARKER_GET(regionTag, nevents, events, time, count) +Shortcut for likwid_markerGetResults() for \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed +*/ +/*! +\def LIKWID_MARKER_SWITCH +Shortcut for likwid_markerNextGroup() if compiled with -DLIKWID_PERFMON.
diff --git a/collectors/likwid/groups/zen3/.empty b/collectors/likwid/groups/zen3/.empty
new file mode 100644
index 0000000..5e965d1
--- /dev/null
+++ b/collectors/likwid/groups/zen3/.empty
@@ -0,0 +1 @@
+There is currently no public documentation for AMD Zen3. This folder is just a placeholder for future performance groups.
diff --git a/collectors/likwid/liblikwid-hwloc.a b/collectors/likwid/liblikwid-hwloc.a
new file mode 100644
index 0000000..09feadd
Binary files /dev/null and b/collectors/likwid/liblikwid-hwloc.a differ
diff --git a/collectors/likwid/liblikwid.a b/collectors/likwid/liblikwid.a
new file mode 100644
index 0000000..a3e223f
Binary files /dev/null and b/collectors/likwid/liblikwid.a differ
diff --git a/collectors/likwid/likwid-marker.h b/collectors/likwid/likwid-marker.h
new file mode 100644
index 0000000..ebf8b89
--- /dev/null
+++ b/collectors/likwid/likwid-marker.h
@@ -0,0 +1,170 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: likwid-marker.h
+ *
+ * Description: Header File of likwid Marker API
+ *
+ * Version: <VERSION>
+ * Released: <DATE>
+ *
+ * Authors: Thomas Gruber (tg), thomas.roehl@googlemail.com
+ *
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+#ifndef LIKWID_MARKER_H
+#define LIKWID_MARKER_H
+
+
+/** \addtogroup MarkerAPI Marker API module
+* @{
+*/
+/*!
+\def LIKWID_MARKER_INIT
+Shortcut for likwid_markerInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_THREADINIT
+Shortcut for likwid_markerThreadInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_REGISTER(regionTag)
+Shortcut for likwid_markerRegisterRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_START(regionTag)
+Shortcut for likwid_markerStartRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_STOP(regionTag)
+Shortcut for likwid_markerStopRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
+Shortcut for likwid_markerGetRegion() for \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_SWITCH
+Shortcut for likwid_markerNextGroup() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_RESET(regionTag)
+Shortcut for likwid_markerResetRegion() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_CLOSE
+Shortcut for likwid_markerClose() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/** @}*/
+
+#ifdef LIKWID_PERFMON
+#include <likwid.h>
+#define LIKWID_MARKER_INIT likwid_markerInit()
+#define LIKWID_MARKER_THREADINIT likwid_markerThreadInit()
+#define LIKWID_MARKER_SWITCH likwid_markerNextGroup()
+#define LIKWID_MARKER_REGISTER(regionTag) likwid_markerRegisterRegion(regionTag)
+#define LIKWID_MARKER_START(regionTag) likwid_markerStartRegion(regionTag)
+#define LIKWID_MARKER_STOP(regionTag) likwid_markerStopRegion(regionTag)
+#define LIKWID_MARKER_CLOSE likwid_markerClose()
+#define LIKWID_MARKER_RESET(regionTag) likwid_markerResetRegion(regionTag)
+#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count) likwid_markerGetRegion(regionTag, nevents, events, time, count)
+#else /* LIKWID_PERFMON */
+#define LIKWID_MARKER_INIT
+#define LIKWID_MARKER_THREADINIT
+#define LIKWID_MARKER_SWITCH
+#define LIKWID_MARKER_REGISTER(regionTag)
+#define LIKWID_MARKER_START(regionTag)
+#define LIKWID_MARKER_STOP(regionTag)
+#define LIKWID_MARKER_CLOSE
+#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
+#define LIKWID_MARKER_RESET(regionTag)
+#endif /* LIKWID_PERFMON */
+
+
+/** \addtogroup NvMarkerAPI NvMarker API module (MarkerAPI for Nvidia GPUs)
+* @{
+*/
+/*!
+\def LIKWID_NVMARKER_INIT
+Shortcut for likwid_gpuMarkerInit() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_NVMARKER_THREADINIT
+Shortcut for likwid_gpuMarkerThreadInit() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_NVMARKER_REGISTER(regionTag)
+Shortcut for likwid_gpuMarkerRegisterRegion() with \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_NVMARKER_START(regionTag)
+Shortcut for likwid_gpuMarkerStartRegion() with \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_NVMARKER_STOP(regionTag)
+Shortcut for likwid_gpuMarkerStopRegion() with \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_NVMARKER_GET(regionTag, ngpus, nevents, events, time, count)
+Shortcut for likwid_gpuMarkerGetRegion() for \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_NVMARKER_SWITCH
+Shortcut for likwid_gpuMarkerNextGroup() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_NVMARKER_RESET(regionTag)
+Shortcut for likwid_gpuMarkerResetRegion() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_NVMARKER_CLOSE
+Shortcut for likwid_gpuMarkerClose() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/** @}*/
+
+#ifdef LIKWID_NVMON
+#ifndef LIKWID_WITH_NVMON
+#define LIKWID_WITH_NVMON
+#endif
+#include <likwid.h>
+#define LIKWID_NVMARKER_INIT likwid_gpuMarkerInit()
+#define LIKWID_NVMARKER_THREADINIT likwid_gpuMarkerThreadInit()
+#define LIKWID_NVMARKER_SWITCH likwid_gpuMarkerNextGroup()
+#define LIKWID_NVMARKER_REGISTER(regionTag) likwid_gpuMarkerRegisterRegion(regionTag)
+#define LIKWID_NVMARKER_START(regionTag) likwid_gpuMarkerStartRegion(regionTag)
+#define LIKWID_NVMARKER_STOP(regionTag) likwid_gpuMarkerStopRegion(regionTag)
+#define LIKWID_NVMARKER_CLOSE likwid_gpuMarkerClose()
+#define LIKWID_NVMARKER_RESET(regionTag) likwid_gpuMarkerResetRegion(regionTag)
+#define LIKWID_NVMARKER_GET(regionTag, ngpus, nevents, events, time, count) \
+    likwid_gpuMarkerGetRegion(regionTag, ngpus, nevents, events, time, count)
+#else /* LIKWID_NVMON */
+#define LIKWID_NVMARKER_INIT
+#define LIKWID_NVMARKER_THREADINIT
+#define LIKWID_NVMARKER_SWITCH
+#define LIKWID_NVMARKER_REGISTER(regionTag)
+#define LIKWID_NVMARKER_START(regionTag)
+#define LIKWID_NVMARKER_STOP(regionTag)
+#define LIKWID_NVMARKER_CLOSE
+#define LIKWID_NVMARKER_GET(regionTag, ngpus, nevents, events, time, count)
+#define LIKWID_NVMARKER_RESET(regionTag)
+#endif /* LIKWID_NVMON */
+
+
+
+#endif /* LIKWID_MARKER_H */
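Because every macro above expands to a no-op unless the code is compiled with -DLIKWID_PERFMON, the instrumentation can stay in production sources. A minimal usage sketch, assuming the header and library are on the include and link paths (e.g. gcc -DLIKWID_PERFMON example.c -llikwid) and the binary is run under likwid-perfctr in marker mode; the region name "calc" is arbitrary:

```c
#include <likwid-marker.h>

int main(void)
{
    double sum = 0.0;

    LIKWID_MARKER_INIT;             /* once, in a serial region */
    /* threaded codes would also call LIKWID_MARKER_THREADINIT per thread */
    LIKWID_MARKER_REGISTER("calc"); /* optional, reduces start overhead */

    LIKWID_MARKER_START("calc");
    for (int i = 0; i < 100000000; i++)
        sum += 0.5 * i;             /* the code to be measured */
    LIKWID_MARKER_STOP("calc");

    LIKWID_MARKER_CLOSE;            /* writes region results for likwid-perfctr */
    return sum < 0.0;               /* keep 'sum' observable */
}
```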
diff --git a/collectors/likwid/likwid.go b/collectors/likwid/likwid.go
index e0e3b11..f07acef 100644
--- a/collectors/likwid/likwid.go
+++ b/collectors/likwid/likwid.go
@@ -1,4 +1,5 @@
 package main
+
 /*
 #cgo CFLAGS: -I.
 #cgo LDFLAGS: -L. -llikwid -llikwid-hwloc -lm
@@ -11,7 +12,7 @@ import "unsafe"
 
 func main() {
 	var topo C.CpuTopology_t
-	C.topology_init();
+	C.topology_init()
 	topo = C.get_cpuTopology()
 	cpulist := make([]C.int, topo.numHWThreads)
 	for a := 0; a < int(topo.numHWThreads); a++ {
@@ -27,5 +28,5 @@
 	fmt.Println(v)
 	C.free(unsafe.Pointer(gstring))
 	C.perfmon_finalize()
-	C.topology_finalize();
+	C.topology_finalize()
 }
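For orientation, the cgo test program above calls straight into the C API declared in likwid.h below. The following C sketch mirrors its topology init/query/finalize sequence; it is illustrative only and not part of the patch (the CpuTopology and HWThread fields used here are the ones declared further down in the header):

```c
#include <stdio.h>
#include <likwid.h>   /* assumes -I. and -llikwid -llikwid-hwloc -lm, as in the cgo flags */

int main(void)
{
    topology_init();                  /* fills the internal cpuid_topology structure */
    CpuTopology_t topo = get_cpuTopology();

    printf("HW threads: %u, sockets: %u\n",
           topo->numHWThreads, topo->numSockets);
    for (uint32_t i = 0; i < topo->numHWThreads; i++) {
        HWThread* t = &topo->threadPool[i];
        printf("thread %u -> core %u, socket %u\n",
               t->threadId, t->coreId, t->packageId);
    }

    topology_finalize();              /* retrieved pointers are invalid afterwards */
    return 0;
}
```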
diff --git a/collectors/likwid/likwid.h b/collectors/likwid/likwid.h
new file mode 100644
index 0000000..e48e6d7
--- /dev/null
+++ b/collectors/likwid/likwid.h
@@ -0,0 +1,2305 @@
+/*
+ * =======================================================================================
+ *
+ * Filename: likwid.h
+ *
+ * Description: Header File of likwid API
+ *
+ * Version: <VERSION>
+ * Released: <DATE>
+ *
+ * Authors: Thomas Gruber (tr), thomas.roehl@googlemail.com
+ *
+ * Project: likwid
+ *
+ * Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation, either version 3 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ * PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+#ifndef LIKWID_H
+#define LIKWID_H
+
+#include <stdint.h>
+#include <errno.h>
+#include <string.h>
+
+#include <bstrlib.h>
+
+#define DEBUGLEV_ONLY_ERROR 0
+#define DEBUGLEV_INFO 1
+#define DEBUGLEV_DETAIL 2
+#define DEBUGLEV_DEVELOP 3
+
+#define LIKWID_VERSION "VERSION.RELEASE.MINORVERSION"
+#define LIKWID_COMMIT GITCOMMIT
+
+extern int perfmon_verbosity;
+extern int likwid_nvmon_verbosity;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef LIKWID_MARKER_INIT
+#include <likwid-marker.h>
+#endif
+
+/*
+################################################################################
+# Marker API related functions
+################################################################################
+*/
+/** \addtogroup MarkerAPI Marker API module
+* @{
+*/
+/*! \brief Initialize LIKWID's marker API
+
+Must be called in serial region of the application to set up basic data structures
+of LIKWID.
+Reads environment variables:
+- LIKWID_MODE (access mode)
+- LIKWID_MASK (event bitmask)
+- LIKWID_EVENTS (event string)
+- LIKWID_THREADS (cpu list separated by ,)
+- LIKWID_GROUPS (amount of groups)
+*/
+extern void likwid_markerInit(void) __attribute__ ((visibility ("default") ));
+/*! \brief Initialize LIKWID's marker API for the current thread
+
+Must be called in parallel region of the application to set up basic data structures
+of LIKWID. Before you can call likwid_markerThreadInit() you have to call likwid_markerInit().
+
+*/
+extern void likwid_markerThreadInit(void) __attribute__ ((visibility ("default") ));
+/*! \brief Switch to next group to measure
+
+Should be called in a serial region of code. If it is to be called from inside
+a parallel region, ensure only one thread runs it by using "#pragma omp single"
+or similar. Additionally, if this function is called in a parallel region,
+ensure that the serial region is preceded by a barrier ("#pragma omp barrier"
+or similar) to prevent race conditions.
+*/
+extern void likwid_markerNextGroup(void) __attribute__ ((visibility ("default") ));
+/*! \brief Close LIKWID's marker API
+
+Must be called in serial region of the application. It gathers all data of regions and
+writes them out to a file (filepath in env variable LIKWID_FILEPATH).
+*/
+extern void likwid_markerClose(void) __attribute__ ((visibility ("default") ));
+/*! \brief Register a measurement region
+
+Initializes the hashTable entry in order to reduce execution time of likwid_markerStartRegion()
+@param regionTag [in] Initialize data using this string
+@return Error code
+*/
+extern int likwid_markerRegisterRegion(const char* regionTag) __attribute__ ((visibility ("default") ));
+/*! \brief Start a measurement region
+
+Reads the values of all configured counters and saves the results under the
+name given in regionTag. Must be called on every thread that is to be measured,
+e.g. if the code to be measured is run in a parallel region, this function must
+also be called in a parallel region (typically the same parallel region as the
+measured code). If this function is to be called multiple times in one parallel
+region, place a barrier ("#pragma omp barrier" or similar) before each call to
+likwid_markerStartRegion
+@param regionTag [in] Store data using this string
+@return Error code of start operation
+*/
+extern int likwid_markerStartRegion(const char* regionTag) __attribute__ ((visibility ("default") ));
+/*! \brief Stop a measurement region
+
+Reads the values of all configured counters and saves the results under the
+name given in regionTag. The measurement data of the stopped region gets summed
+up in global region counters. Must be called on every thread that is to be
+measured, e.g. if the code to be measured is run in a parallel region, this
+function must also be called in a parallel region (typically the same parallel
+region as the measured code). If this function is called multiple times in one
+parallel region, place a barrier ("#pragma omp barrier" or similar) after each
+call to likwid_markerStopRegion
+@param regionTag [in] Store data using this string
+@return Error code of stop operation
+*/
+extern int likwid_markerStopRegion(const char* regionTag) __attribute__ ((visibility ("default") ));
+/*! \brief Reset a measurement region
+
+Reset the values of all configured counters and timers.
+@param regionTag [in] Reset data using this string
+@return Error code of reset operation
+*/
+extern int likwid_markerResetRegion(const char* regionTag) __attribute__ ((visibility ("default") ));
+/*! \brief Get accumulated data of a code region
+
+Get the accumulated data of the current thread for the given regionTag.
+@param regionTag [in] Print data using this string
+@param nr_events [in,out] Length of events array
+@param events [out] Events array for the intermediate results
+@param time [out] Accumulated measurement time
+@param count [out] Call count of the code region
+*/
+extern void likwid_markerGetRegion(const char* regionTag, int* nr_events, double* events, double *time, int *count) __attribute__ ((visibility ("default") ));
+/* utility routines */
+/*! \brief Get CPU ID of the current process/thread
+
+Returns the ID of the CPU the current process or thread is running on.
+@return current CPU ID
+*/
+extern int likwid_getProcessorId() __attribute__ ((visibility ("default") ));
+/*! \brief Pin the current process to given CPU
+
+Pin the current process to the given CPU ID. The process cannot be scheduled to
+another CPU after pinning but the pinning can be changed anytime with this function.
+@param [in] processorId CPU ID to pin the current process to
+@return error code (1 for success, 0 for error)
+*/
+extern int likwid_pinProcess(int processorId) __attribute__ ((visibility ("default") ));
+/*! \brief Pin the current thread to given CPU
+
+Pin the current thread to the given CPU ID. The thread cannot be scheduled to
+another CPU after pinning but the pinning can be changed anytime with this function.
+@param [in] processorId CPU ID to pin the current thread to
+@return error code (1 for success, 0 for error)
+*/
+extern int likwid_pinThread(int processorId) __attribute__ ((visibility ("default") ));
+/** @}*/
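These utility routines are enough for simple self-pinning without any measurement setup. A small sketch, assuming a machine on which CPU 2 exists; illustrative only, not part of the patch:

```c
#include <stdio.h>
#include <likwid.h>

int main(void)
{
    printf("running on CPU %d\n", likwid_getProcessorId());

    /* per the docs above: returns 1 for success, 0 for error */
    if (likwid_pinThread(2) == 1)
        printf("pinned, now on CPU %d\n", likwid_getProcessorId());
    else
        fprintf(stderr, "pinning to CPU 2 failed\n");

    return 0;
}
```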
+
+/*
+################################################################################
+# Access client related functions
+################################################################################
+*/
+/** \addtogroup Access Access module
+ * @{
+ */
+
+/*! \brief Enum for the access modes
+
+LIKWID supports multiple access modes to the MSR and PCI performance monitoring
+registers. For direct access the user must have enough privileges to access the
+MSR and PCI devices. The daemon mode forwards the operations to a daemon with
+higher privileges.
+*/
+typedef enum {
+    ACCESSMODE_PERF = -1, /*!< \brief Access performance monitoring through perf_event kernel interface */
+    ACCESSMODE_DIRECT = 0, /*!< \brief Access performance monitoring registers directly */
+    ACCESSMODE_DAEMON = 1 /*!< \brief Use the access daemon to access the registers */
+} AccessMode;
+
+/*! \brief Set access mode
+
+Sets the mode in which the MSR and PCI registers should be accessed: 0 for direct access (probably root privileges required) and 1 for accesses through the access daemon. It must be called before HPMinit()
+@param [in] mode (0=direct, 1=daemon)
+*/
+extern void HPMmode(int mode) __attribute__ ((visibility ("default") ));
+/*! \brief Initialize access module
+
+Initialize the module internals to either the MSR/PCI files or the access daemon
+@return error code (0 for success)
+*/
+extern int HPMinit() __attribute__ ((visibility ("default") ));
+/*! \brief Add CPU to access module
+
+Add the given CPU to the access module. This opens the communication to either the MSR/PCI files or the access daemon.
+@param [in] cpu_id CPU that should be enabled for measurements
+@return error code (0 for success, -ENODEV if access cannot be initialized)
+*/
+extern int HPMaddThread(int cpu_id) __attribute__ ((visibility ("default") ));
+/*! \brief Close connections
+
+Close the connections to the MSR/PCI files or the access daemon
+*/
+extern void HPMfinalize() __attribute__ ((visibility ("default") ));
+/** @}*/
+
+/*
+################################################################################
+# Config file related functions
+################################################################################
+*/
+/** \addtogroup Config Config file module
+* @{
+*/
+/*! \brief Structure holding values of the configuration file
+
+LIKWID supports the definition of runtime values in a configuration file. The
+most important configurations in most cases are the path to the access daemon and
+the corresponding access mode. In order to avoid reading in the system topology
+at each start, a path to a topology file can be set. The other values are mostly
+used internally.
+*/
+typedef struct {
+    char* configFileName; /*!< \brief Path to the configuration file */
+    char* topologyCfgFileName; /*!< \brief Path to the topology file */
+    char* daemonPath; /*!< \brief Path of the access daemon */
+    char* groupPath; /*!< \brief Path of default performance group directory */
+    AccessMode daemonMode; /*!< \brief Access mode to the MSR and PCI registers */
+    int maxNumThreads; /*!< \brief Maximum number of HW threads */
+    int maxNumNodes; /*!< \brief Maximum number of NUMA nodes */
+} Likwid_Configuration;
+
+/** \brief Pointer for exporting the Configuration data structure */
+typedef Likwid_Configuration* Configuration_t;
+/*! \brief Read the config file of LIKWID, if it exists
+
+Search for LIKWID config file and read the values in.
+Currently the paths /usr/local/etc/likwid.cfg, /etc/likwid.cfg and the path
+defined in config.mk are checked.
+@return error code (0 for success, -EFAULT if no file can be found)
+*/
+extern int init_configuration(void) __attribute__ ((visibility ("default") ));
+/*! \brief Destroy the config structure
+
+Destroys the current config structure and frees all allocated memory for path names
+@return error code (0 for success, -EFAULT if config structure not initialized)
+*/
+extern int destroy_configuration(void) __attribute__ ((visibility ("default") ));
+
+
+/*!
\brief Retrieve the config structure + +Get the initialized configuration +\sa Configuration_t +@return Configuration_t (pointer to internal Configuration structure) +*/ +extern Configuration_t get_configuration(void) __attribute__ ((visibility ("default") )); + +/*! \brief Set group path in the config struction + +Set group path in the config struction. The path must be a directory. +@param [in] path +@return error code (0 for success, -ENOMEM if reallocation failed, -ENOTDIR if no directoy) +*/ +extern int config_setGroupPath(const char* path) __attribute__ ((visibility ("default") )); + +/** @}*/ +/* +################################################################################ +# CPU topology related functions +################################################################################ +*/ +/** \addtogroup CPUTopology CPU information module +* @{ +*/ +/*! \brief Structure with general CPU information + +General information covers CPU family, model, name and current clock and vendor +specific information like the version of Intel's performance monitoring facility. +*/ +typedef struct { + uint32_t family; /*!< \brief CPU family ID*/ + uint32_t model; /*!< \brief CPU model ID */ + uint32_t stepping; /*!< \brief Stepping (version) of the CPU */ + uint32_t vendor; /*!< \brief Vendor of the CPU */ + uint32_t part; /*!< \brief Part number of the CPU */ + uint64_t clock; /*!< \brief Current clock frequency of the executing CPU*/ + int turbo; /*!< \brief Flag if CPU has a turbo mode */ + char* osname; /*!< \brief Name of the CPU reported by OS */ + char* name; /*!< \brief Name of the CPU as identified by LIKWID */ + char* short_name; /*!< \brief Short name of the CPU*/ + char* features; /*!< \brief String with all features supported by the CPU*/ + int isIntel; /*!< \brief Flag if it is an Intel CPU*/ + char architecture[20]; /*!< \brief name of the architecture like x86_64 or ppc64 (comparable with uname -m)*/ + int supportUncore; /*!< \brief Flag if system has Uncore performance monitors */ + int supportClientmem; /*!< \brief Flag if system has mappable memory controllers */ + uint64_t featureFlags; /*!< \brief Mask of all features supported by the CPU*/ + uint32_t perf_version; /*!< \brief Version of Intel's performance monitoring facility */ + uint32_t perf_num_ctr; /*!< \brief Number of general purpose HWthread-local performance monitoring counters */ + uint32_t perf_width_ctr; /*!< \brief Bit width of fixed and general purpose counters */ + uint32_t perf_num_fixed_ctr; /*!< \brief Number of fixed purpose HWthread-local performance monitoring counters */ +} CpuInfo; + +/*! \brief Structure with IDs of a HW thread + +For each HW thread this structure stores the ID of the thread inside a CPU, the +CPU core ID of the HW thread and the CPU socket ID. +\extends CpuTopology +*/ +typedef struct { + uint32_t threadId; /*!< \brief ID of HW thread inside the CPU core */ + uint32_t coreId; /*!< \brief ID of CPU core that executes the HW thread */ + uint32_t packageId; /*!< \brief ID of CPU socket containing the HW thread */ + uint32_t apicId; /*!< \brief ID of HW thread retrieved through the Advanced Programmable Interrupt Controller */ + uint32_t inCpuSet; /*!< \brief Flag if HW thread is inside the CPUset */ +} HWThread; + +/*! \brief Enum of possible caches + +CPU caches can have different tasks and hold different kind of data. 
This enum lists all shapes used in all supported CPUs +\extends CacheLevel +*/ +typedef enum { + NOCACHE=0, /*!< \brief No cache used as undef value */ + DATACACHE, /*!< \brief Cache holding data cache lines */ + INSTRUCTIONCACHE, /*!< \brief Cache holding instruction cache lines */ + UNIFIEDCACHE, /*!< \brief Cache holding both instruction and data cache lines */ + ITLB, /*!< \brief Translation Lookaside Buffer cache for instruction pages */ + DTLB /*!< \brief Translation Lookaside Buffer cache for data pages */ +} CacheType; + +/*! \brief Structure describing a cache level + +CPUs are connected to a cache hierarchy with different amount of caches at each level. The CacheLevel structure holds general information about the cache. +\extends CpuTopology +*/ +typedef struct { + uint32_t level; /*!< \brief Level of the cache in the hierarchy */ + CacheType type; /*!< \brief Type of the cache */ + uint32_t associativity; /*!< \brief Amount of cache lines hold by each set */ + uint32_t sets; /*!< \brief Amount of sets */ + uint32_t lineSize; /*!< \brief Size in bytes of one cache line */ + uint32_t size; /*!< \brief Size in bytes of the cache */ + uint32_t threads; /*!< \brief Number of HW thread connected to the cache */ + uint32_t inclusive; /*!< \brief Flag if cache is inclusive (holds also cache lines available in caches nearer to the CPU) or exclusive */ +} CacheLevel; + +/*! \brief Structure describing the topology of the HW threads in the system + +This structure describes the topology at HW thread level like the amount of HW threads, how they are distributed over the CPU sockets/packages and how the caching hierarchy is assembled. +*/ +typedef struct { + uint32_t numHWThreads; /*!< \brief Amount of active HW threads in the system (e.g. in cpuset) */ + uint32_t activeHWThreads; /*!< \brief Amount of HW threads in the system and length of \a threadPool */ + uint32_t numSockets; /*!< \brief Amount of CPU sockets/packages in the system */ + uint32_t numCoresPerSocket; /*!< \brief Amount of physical cores in one CPU socket/package */ + uint32_t numThreadsPerCore; /*!< \brief Amount of HW threads in one physical CPU core */ + uint32_t numCacheLevels; /*!< \brief Amount of caches for each HW thread and length of \a cacheLevels */ + HWThread* threadPool; /*!< \brief List of all HW thread descriptions */ + CacheLevel* cacheLevels; /*!< \brief List of all caches in the hierarchy */ + struct treeNode* topologyTree; /*!< \brief Anchor for a tree structure describing the system topology */ +} CpuTopology; + +/*! \brief Variable holding the global cpu information structure */ +extern CpuInfo cpuid_info; +/*! \brief Variable holding the global cpu topology structure */ +extern CpuTopology cpuid_topology; + +/** \brief Pointer for exporting the CpuInfo data structure */ +typedef CpuInfo* CpuInfo_t; +/** \brief Pointer for exporting the CpuTopology data structure */ +typedef CpuTopology* CpuTopology_t; +/*! \brief Initialize topology information + +CpuInfo_t and CpuTopology_t are initialized by either HWLOC, CPUID/ProcFS or topology file if present. The topology file name can be configured in the configuration file. Furthermore, the paths /etc/likwid_topo.cfg and <PREFIX>/etc/likwid_topo.cfg are checked. +\sa CpuInfo_t and CpuTopology_t +@return always 0 +*/ +extern int topology_init(void) __attribute__ ((visibility ("default") )); +/*! 
\brief Retrieve CPU topology of the current machine + +\sa CpuTopology_t +@return CpuTopology_t (pointer to internal cpuid_topology structure) +*/ +extern CpuTopology_t get_cpuTopology(void) __attribute__ ((visibility ("default") )); +/*! \brief Retrieve CPU information of the current machine + +Get the previously initialized CPU info structure containing number of CPUs/Threads +\sa CpuInfo_t +@return CpuInfo_t (pointer to internal cpuid_info structure) +*/ +extern CpuInfo_t get_cpuInfo(void) __attribute__ ((visibility ("default") )); +/*! \brief Destroy topology structures CpuInfo_t and CpuTopology_t. + +Retrieved pointers to the structures are not valid anymore after this function call +\sa CpuInfo_t and CpuTopology_t +*/ +extern void topology_finalize(void) __attribute__ ((visibility ("default") )); +/*! \brief Print all supported architectures +*/ +extern void print_supportedCPUs(void) __attribute__ ((visibility ("default") )); +/** @}*/ +/* +################################################################################ +# NUMA related functions +################################################################################ +*/ +/** \addtogroup NumaTopology NUMA memory topology module + * @{ + */ +/*! \brief CPUs in NUMA node and general information about a NUMA domain + +The NumaNode structure describes the topology and holds general information of a +NUMA node. The structure is filled by calling numa_init() by either the HWLOC +library or by evaluating the /proc filesystem. +\extends NumaTopology +*/ +typedef struct { + uint32_t id; /*!< \brief ID of the NUMA node */ + uint64_t totalMemory; /*!< \brief Amount of memory in the NUMA node */ + uint64_t freeMemory; /*!< \brief Amount of free memory in the NUMA node */ + uint32_t numberOfProcessors; /*!< \brief umber of processors covered by the NUMA node and length of \a processors */ + uint32_t* processors; /*!< \brief List of HW threads in the NUMA node */ + uint32_t numberOfDistances; /*!< \brief Amount of distances to the other NUMA nodes in the system and self */ + uint32_t* distances; /*!< \brief List of distances to the other NUMA nodes and self */ +} NumaNode; + + +/*! \brief The NumaTopology structure describes all NUMA nodes in the current system. +*/ +typedef struct { + uint32_t numberOfNodes; /*!< \brief Number of NUMA nodes in the system and length of \a nodes */ + NumaNode* nodes; /*!< \brief List of NUMA nodes */ +} NumaTopology; + +/*! \brief Variable holding the global NUMA information structure */ +extern NumaTopology numa_info; + +/** \brief Pointer for exporting the NumaTopology data structure */ +typedef NumaTopology* NumaTopology_t; + +/*! \brief Initialize NUMA information + +Initialize NUMA information NumaTopology_t using either HWLOC or CPUID/ProcFS. If +a topology config file is present it is read at topology_init() and fills \a NumaTopology_t +\sa NumaTopology_t +@return error code (0 for success, -1 if initialization failed) +*/ +extern int numa_init(void) __attribute__ ((visibility ("default") )); +/*! \brief Retrieve NUMA information of the current machine + +Get the previously initialized NUMA info structure +\sa NumaTopology_t +@return NumaTopology_t (pointer to internal numa_info structure) +*/ +extern NumaTopology_t get_numaTopology(void) __attribute__ ((visibility ("default") )); +/*! 
\brief Set memory allocation policy to interleaved + +Set the memory allocation policy to interleaved for given list of CPUs +@param [in] processorList List of processors +@param [in] numberOfProcessors Length of processor list +*/ +extern void numa_setInterleaved(const int* processorList, int numberOfProcessors) __attribute__ ((visibility ("default") )); +/*! \brief Allocate memory from a specific specific NUMA node +@param [in,out] ptr Start pointer of memory +@param [in] size Size for the allocation +@param [in] domainId ID of NUMA node for the allocation +*/ +extern void numa_membind(void* ptr, size_t size, int domainId) __attribute__ ((visibility ("default") )); +/*! \brief Set memory allocation policy to membind + +Set the memory allocation policy to membind for given list of CPUs. This forces +allocation to be placed in NUMA domains spanning the given processor list. +@param [in] processorList List of processors +@param [in] numberOfProcessors Length of processor list +*/ +extern void numa_setMembind(const int* processorList, int numberOfProcessors) __attribute__ ((visibility ("default") )); +/*! \brief Destroy NUMA information structure + +Destroys the NUMA information structure NumaTopology_t. Retrieved pointers +to the structures are not valid anymore after this function call +\sa NumaTopology_t +*/ +extern void numa_finalize(void) __attribute__ ((visibility ("default") )); +/*! \brief Retrieve the number of NUMA nodes + +Returns the number of NUMA nodes of the current machine. Can also be read out of +NumaTopology_t +\sa NumaTopology_t +@return Number of NUMA nodes +*/ +extern int likwid_getNumberOfNodes(void) __attribute__ ((visibility ("default") )); +/** @}*/ +/* +################################################################################ +# Affinity domains related functions +################################################################################ +*/ +/** \addtogroup AffinityDomains Thread affinity module + * @{ + */ + +/*! \brief The AffinityDomain data structure describes a single domain in the current system + +The AffinityDomain data structure describes a single domain in the current system. Example domains are NUMA nodes, CPU sockets/packages or LLC (Last Level Cache) cache domains. +\extends AffinityDomains +*/ +typedef struct { + bstring tag; /*!< \brief Bstring with the ID for the affinity domain. Currently possible values: N (node), SX (socket/package X), CX (LLC cache domain X) and MX (memory domain X) */ + uint32_t numberOfProcessors; /*!< \brief Number of HW threads in the domain and length of \a processorList */ + uint32_t numberOfCores; /*!< \brief Number of hardware threads in the domain */ + int* processorList; /*!< \brief List of HW thread IDs in the domain */ +} AffinityDomain; + +/*! \brief The AffinityDomains data structure holds different count variables describing the +various system layers + +Affinity domains are for example the amount of NUMA domains, CPU sockets/packages or LLC +(Last Level Cache) cache domains of the current machine. Moreover a list of +\a domains holds the processor lists for each domain that are used for +scheduling processes to domain specific HW threads. Some amounts are duplicates +or derivation of values in \a CpuInfo, \a CpuTopology and \a NumaTopology. 
+*/ +typedef struct { + uint32_t numberOfSocketDomains; /*!< \brief Number of CPU sockets/packages in the system */ + uint32_t numberOfNumaDomains; /*!< \brief Number of NUMA nodes in the system */ + uint32_t numberOfProcessorsPerSocket; /*!< \brief Number of HW threads per socket/package in the system */ + uint32_t numberOfCacheDomains; /*!< \brief Number of LLC caches in the system */ + uint32_t numberOfCoresPerCache; /*!< \brief Number of CPU cores per LLC cache in the system */ + uint32_t numberOfProcessorsPerCache; /*!< \brief Number of hardware threads per LLC cache in the system */ + uint32_t numberOfAffinityDomains; /*!< \brief Number of affinity domains in the current system and length of \a domains array */ + AffinityDomain* domains; /*!< \brief List of all domains in the system */ +} AffinityDomains; + +/** \brief Pointer for exporting the AffinityDomains data structure */ +typedef AffinityDomains* AffinityDomains_t; + +/*! \brief Initialize affinity information + +Initialize affinity information AffinityDomains_t using the data of the structures +\a CpuInfo_t, CpuTopology_t and NumaTopology_t +\sa AffinityDomains_t +*/ +extern void affinity_init() __attribute__ ((visibility ("default") )); +/*! \brief Retrieve affinity structure + +Get the previously initialized affinity info structure +\sa AffinityDomains_t +@return AffinityDomains_t (pointer to internal affinityDomains structure) +*/ +extern AffinityDomains_t get_affinityDomains(void) __attribute__ ((visibility ("default") )); +/*! \brief Pin process to a CPU + +Pin process to a CPU. Duplicate of likwid_pinProcess() +@param [in] processorId CPU ID for pinning +*/ +extern void affinity_pinProcess(int processorId) __attribute__ ((visibility ("default") )); +/*! \brief Pin processes to a CPU + +Pin processes to a CPU. Creates a cpuset with the given processor IDs +@param [in] cpu_count Number of processors in processorIds +@param [in] processorIds Array of processor IDs +*/ +extern void affinity_pinProcesses(int cpu_count, const int* processorIds) __attribute__ ((visibility ("default") )); +/*! \brief Pin thread to a CPU + +Pin thread to a CPU. Duplicate of likwid_pinThread() +@param [in] processorId CPU ID for pinning +*/ +extern void affinity_pinThread(int processorId) __attribute__ ((visibility ("default") )); +/*! \brief Return the CPU ID where the current process runs. + +@return CPU ID +*/ +extern int affinity_processGetProcessorId() __attribute__ ((visibility ("default") )); +/*! \brief Return the CPU ID where the current thread runs. + +@return CPU ID +*/ +extern int affinity_threadGetProcessorId() __attribute__ ((visibility ("default") )); +/*! \brief Destroy affinity information structure + +Destroys the affinity information structure AffinityDomains_t. Retrieved pointers +to the structures are not valid anymore after this function call +\sa AffinityDomains_t +*/ +extern void affinity_finalize() __attribute__ ((visibility ("default") )); +/** @}*/ + +/* +################################################################################ +# CPU string parsing related functions +################################################################################ +*/ +/** \addtogroup CPUParse CPU string parser module + * @{ + */ + +/*! \brief Read CPU selection string and resolve to available CPU numbers + +Reads the CPU selection string and fills the given list with the CPU numbers +defined in the selection string. 
This function is a interface function for the +different selection modes: scatter, expression, logical and physical. +@param [in] cpustring Selection string +@param [in,out] cpulist List of CPUs +@param [in] length Length of cpulist +@return error code (>0 on success for the returned list length, -ERRORCODE on failure) +*/ +extern int cpustr_to_cpulist(const char* cpustring, int* cpulist, int length) __attribute__ ((visibility ("default") )); +/*! \brief Read NUMA node selection string and resolve to available NUMA node numbers + +Reads the NUMA node selection string and fills the given list with the NUMA node numbers +defined in the selection string. +@param [in] nodestr Selection string +@param [out] nodes List of available NUMA nodes +@param [in] length Length of NUMA node list +@return error code (>0 on success for the returned list length, -ERRORCODE on failure) +*/ +extern int nodestr_to_nodelist(const char* nodestr, int* nodes, int length) __attribute__ ((visibility ("default") )); +/*! \brief Read CPU socket selection string and resolve to available CPU socket numbers + +Reads the CPU socket selection string and fills the given list with the CPU socket numbers +defined in the selection string. +@param [in] sockstr Selection string +@param [out] sockets List of available CPU sockets +@param [in] length Length of CPU socket list +@return error code (>0 on success for the returned list length, -ERRORCODE on failure) +*/ +extern int sockstr_to_socklist(const char* sockstr, int* sockets, int length) __attribute__ ((visibility ("default") )); + +#ifdef LIKWID_WITH_NVMON +/*! \brief Read GPU selection string and resolve to available GPUs numbers + +Reads the GPU selection string and fills the given list with the GPU numbers defined in the selection string. +@param [in] gpustr Selection string +@param [out] gpulist List of available GPU +@param [in] length Length of GPU list +@return error code (>0 on success for the returned list length, -ERRORCODE on failure) +*/ +extern int gpustr_to_gpulist(const char* gpustr, int* gpulist, int length) __attribute__ ((visibility ("default") )); + +#endif /* LIKWID_WITH_NVMON */ + +/** @}*/ + +/* +################################################################################ +# Performance monitoring related functions +################################################################################ +*/ +/** \addtogroup PerfMon Performance monitoring module + * @{ + */ + +/*! \brief Get all groups + +Checks the configured performance group path for the current architecture and +returns all found group names +@return Amount of found performance groups +*/ +extern int perfmon_getGroups(char*** groups, char*** shortinfos, char*** longinfos) __attribute__ ((visibility ("default") )); + +/*! \brief Free all group information + +@param [in] nrgroups Number of groups +@param [in] groups List of group names +@param [in] shortinfos List of short information string about group +@param [in] longinfos List of long information string about group +*/ +extern void perfmon_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos) __attribute__ ((visibility ("default") )); + +/*! \brief Initialize performance monitoring facility + +Initialize the performance monitoring feature by creating basic data structures. +The CPU ids for the threadsToCpu list can be found in cpuTopology->threadPool[thread_id]->apicId. +The access mode must already be set when calling perfmon_init(). 
+\sa HPMmode() function and CpuTopology structure with HWThread list + +@param [in] nrThreads Amount of threads +@param [in] threadsToCpu List of CPUs +@return error code (0 on success, -ERRORCODE on failure) +*/ +extern int perfmon_init(int nrThreads, const int* threadsToCpu) __attribute__ ((visibility ("default") )); + +/*! \brief Initialize performance monitoring maps + +Initialize the performance monitoring maps for counters, events and Uncore boxes +for the current architecture. topology_init() and numa_init() must be called before calling +perfmon_init_maps() +\sa RegisterMap list, PerfmonEvent list and BoxMap list +*/ +extern void perfmon_init_maps(void) __attribute__ ((visibility ("default") )); +/*! \brief Check the performance monitoring maps whether counters and events are available + +Checks each counter and event in the performance monitoring maps for their availibility on +the current system. topology_init(), numa_init() and perfmon_init_maps() must be called before calling +perfmon_check_counter_map(). +\sa RegisterMap list, PerfmonEvent list and BoxMap list +*/ +extern void perfmon_check_counter_map(int cpu_id) __attribute__ ((visibility ("default") )); +/*! \brief Add an event string to LIKWID + +A event string looks like Eventname:Countername(:Option1:Option2:...),... +The eventname, countername and options are checked if they are available. +@param [in] eventCString Event string +@return Returns the ID of the new eventSet +*/ +extern int perfmon_addEventSet(const char* eventCString) __attribute__ ((visibility ("default") )); +/*! \brief Setup all performance monitoring counters of an eventSet + +@param [in] groupId (returned from perfmon_addEventSet() +@return error code (-ENOENT if groupId is invalid and -1 if the counters of one CPU cannot be set up) +*/ +extern int perfmon_setupCounters(int groupId) __attribute__ ((visibility ("default") )); +/*! \brief Start performance monitoring counters + +Start the counters that have been previously set up by perfmon_setupCounters(). +The counter registered are zeroed before enabling the counters +@return 0 on success and -(thread_id+1) for error +*/ +extern int perfmon_startCounters(void) __attribute__ ((visibility ("default") )); +/*! \brief Stop performance monitoring counters + +Stop the counters that have been previously started by perfmon_startCounters(). +This function reads the counters, so afterwards the results are availble through +perfmon_getResult, perfmon_getLastResult, perfmon_getMetric and perfmon_getLastMetric. +@return 0 on success and -(thread_id+1) for error +*/ +extern int perfmon_stopCounters(void) __attribute__ ((visibility ("default") )); +/*! \brief Read the performance monitoring counters on all CPUs + +Read the counters that have been previously started by perfmon_startCounters(). +The counters are stopped directly to avoid interference of LIKWID with the measured +code. Before returning, the counters are started again. +@return 0 on success and -(thread_id+1) for error +*/ +extern int perfmon_readCounters(void) __attribute__ ((visibility ("default") )); +/*! \brief Read the performance monitoring counters on one CPU + +Read the counters that have been previously started by perfmon_startCounters(). +The counters are stopped directly to avoid interference of LIKWID with the measured +code. Before returning, the counters are started again. Only one CPU is read. 
+@param [in] cpu_id CPU ID of the CPU that should be read +@return 0 on success and -(thread_id+1) for error +*/ +extern int perfmon_readCountersCpu(int cpu_id) __attribute__ ((visibility ("default") )); +/*! \brief Read the performance monitoring counters of all threads in a group + +Read the counters that have been previously started by perfmon_startCounters(). +The counters are stopped directly to avoid interference of LIKWID with the measured +code. Before returning, the counters are started again. +@param [in] groupId Read the counters for all threads taking part in group +@return 0 on success and -(thread_id+1) for error +*/ +extern int perfmon_readGroupCounters(int groupId) __attribute__ ((visibility ("default") )); +/*! \brief Read the performance monitoring counters of on thread in a group + +Read the counters that have been previously started by perfmon_startCounters(). +The counters are stopped directly to avoid interference of LIKWID with the measured +code. Before returning, the counters are started again. Only one thread's CPU is read. +@param [in] groupId Read the counters defined in group identified with groupId +@param [in] threadId Read the counters for the thread +@return 0 on success and -(thread_id+1) for error +*/ +extern int perfmon_readGroupThreadCounters(int groupId, int threadId) __attribute__ ((visibility ("default") )); +/*! \brief Switch the active eventSet to a new one + +Stops the currently running counters, switches the eventSet by setting up the +counters and start the counters. +@param [in] new_group ID of group that should be switched to. +@return 0 on success and -(thread_id+1) for error +*/ +extern int perfmon_switchActiveGroup(int new_group) __attribute__ ((visibility ("default") )); +/*! \brief Close the perfomance monitoring facility of LIKWID + +Deallocates all internal data that is used during performance monitoring. Also +the counter values are not accessible after this function. +*/ +extern void perfmon_finalize(void) __attribute__ ((visibility ("default") )); +/*! \brief Get the results of the specified group, counter and thread + +Get the result of all measurement cycles. The function takes care of happened +overflows and if the counter values need to be calculated with multipliers. +@param [in] groupId ID of the group that should be read +@param [in] eventId ID of the event that should be read +@param [in] threadId ID of the thread/cpu that should be read +@return The counter result +*/ +extern double perfmon_getResult(int groupId, int eventId, int threadId) __attribute__ ((visibility ("default") )); +/*! \brief Get the last results of the specified group, counter and thread + +Get the result of the last measurement cycle. The function takes care of happened +overflows and if the counter values need to be calculated with multipliers. +@param [in] groupId ID of the group that should be read +@param [in] eventId ID of the event that should be read +@param [in] threadId ID of the thread/cpu that should be read +@return The counter result +*/ +extern double perfmon_getLastResult(int groupId, int eventId, int threadId) __attribute__ ((visibility ("default") )); +/*! \brief Get the metric result of the specified group, counter and thread + +Get the metric result of all measurement cycles. It reads all raw results for the given groupId and threadId. 
+@param [in] groupId ID of the group that should be read +@param [in] metricId ID of the metric that should be calculated +@param [in] threadId ID of the thread/cpu that should be read +@return The metric result +*/ +extern double perfmon_getMetric(int groupId, int metricId, int threadId) __attribute__ ((visibility ("default") )); +/*! \brief Get the last metric result of the specified group, counter and thread + +Get the metric result of the last measurement cycle. It reads all raw results for the given groupId and threadId. +@param [in] groupId ID of the group that should be read +@param [in] metricId ID of the metric that should be calculated +@param [in] threadId ID of the thread/cpu that should be read +@return The metric result +*/ +extern double perfmon_getLastMetric(int groupId, int metricId, int threadId) __attribute__ ((visibility ("default") )); + +/*! \brief Get the number of configured event groups + +@return Number of groups +*/ +extern int perfmon_getNumberOfGroups(void) __attribute__ ((visibility ("default") )); +/*! \brief Get the number of configured eventSets in group + +@param [in] groupId ID of group +@return Number of eventSets +*/ +extern int perfmon_getNumberOfEvents(int groupId) __attribute__ ((visibility ("default") )); +/*! \brief Get the accumulated measurement time a group + +@param [in] groupId ID of group +@return Time in seconds the event group was measured +*/ +extern double perfmon_getTimeOfGroup(int groupId) __attribute__ ((visibility ("default") )); +/*! \brief Get the ID of the currently set up event group + +@return Number of active group +*/ +extern int perfmon_getIdOfActiveGroup(void) __attribute__ ((visibility ("default") )); +/*! \brief Get the number of threads specified at perfmon_init() + +@return Number of threads +*/ +extern int perfmon_getNumberOfThreads(void) __attribute__ ((visibility ("default") )); + +/*! \brief Set verbosity of LIKWID library + +*/ +extern void perfmon_setVerbosity(int verbose) __attribute__ ((visibility ("default") )); + +/*! \brief Get the event name of the specified group and event + +Get the metric name as defined in the performance group file +@param [in] groupId ID of the group that should be read +@param [in] eventId ID of the event that should be returned +@return The event name or NULL in case of failure +*/ +extern char* perfmon_getEventName(int groupId, int eventId) __attribute__ ((visibility ("default") )); +/*! \brief Get the counter name of the specified group and event + +Get the counter name as defined in the performance group file +@param [in] groupId ID of the group that should be read +@param [in] eventId ID of the event of which the counter should be returned +@return The counter name or NULL in case of failure +*/ +extern char* perfmon_getCounterName(int groupId, int eventId) __attribute__ ((visibility ("default") )); + +/*! \brief Get the name group + +Get the name of group. Either it is the name of the performance group or "Custom" +@param [in] groupId ID of the group that should be read +@return The group name or NULL in case of failure +*/ +extern char* perfmon_getGroupName(int groupId) __attribute__ ((visibility ("default") )); +/*! 
\brief Get the metric name of the specified group and metric + +Get the metric name as defined in the performance group file +@param [in] groupId ID of the group that should be read +@param [in] metricId ID of the metric that should be calculated +@return The metric name or NULL in case of failure +*/ +extern char* perfmon_getMetricName(int groupId, int metricId) __attribute__ ((visibility ("default") )); +/*! \brief Get the short informational string of the specified group + +Returns the short information string as defined by performance groups or "Custom" +in case of custom event sets +@param [in] groupId ID of the group that should be read +@return The short information or NULL in case of failure +*/ +extern char* perfmon_getGroupInfoShort(int groupId) __attribute__ ((visibility ("default") )); +/*! \brief Get the long descriptive string of the specified group + +Returns the long descriptive string as defined by performance groups or NULL +in case of custom event sets +@param [in] groupId ID of the group that should be read +@return The long description or NULL in case of failure +*/ +extern char* perfmon_getGroupInfoLong(int groupId) __attribute__ ((visibility ("default") )); + +/*! \brief Get the number of configured metrics for group + +@param [in] groupId ID of group +@return Number of metrics +*/ +extern int perfmon_getNumberOfMetrics(int groupId) __attribute__ ((visibility ("default") )); + +/*! \brief Get the last measurement time a group + +@param [in] groupId ID of group +@return Time in seconds the event group was measured the last time +*/ +extern double perfmon_getLastTimeOfGroup(int groupId) __attribute__ ((visibility ("default") )); + +/*! \brief Read the output file of the Marker API +@param [in] filename Filename with Marker API results +@return 0 or negative error number +*/ +extern int perfmon_readMarkerFile(const char* filename) __attribute__ ((visibility ("default") )); +/*! \brief Free space for read in Marker API file +*/ +extern void perfmon_destroyMarkerResults() __attribute__ ((visibility ("default") )); +/*! \brief Get the number of regions listed in Marker API result file + +@return Number of regions +*/ +extern int perfmon_getNumberOfRegions() __attribute__ ((visibility ("default") )); +/*! \brief Get the groupID of a region + +@param [in] region ID of region +@return Group ID of region +*/ +extern int perfmon_getGroupOfRegion(int region) __attribute__ ((visibility ("default") )); +/*! \brief Get the tag of a region +@param [in] region ID of region +@return tag of region +*/ +extern char* perfmon_getTagOfRegion(int region) __attribute__ ((visibility ("default") )); +/*! \brief Get the number of events of a region +@param [in] region ID of region +@return Number of events of region +*/ +extern int perfmon_getEventsOfRegion(int region) __attribute__ ((visibility ("default") )); +/*! \brief Get the number of metrics of a region +@param [in] region ID of region +@return Number of metrics of region +*/ +extern int perfmon_getMetricsOfRegion(int region) __attribute__ ((visibility ("default") )); +/*! \brief Get the number of threads of a region +@param [in] region ID of region +@return Number of threads of region +*/ +extern int perfmon_getThreadsOfRegion(int region) __attribute__ ((visibility ("default") )); +/*! 
\brief Get the cpulist of a region +@param [in] region ID of region +@param [in] count Length of cpulist array +@param [in,out] cpulist cpulist array +@return Number of threads of region or count, whatever is lower +*/ +extern int perfmon_getCpulistOfRegion(int region, int count, int* cpulist) __attribute__ ((visibility ("default") )); +/*! \brief Get the accumulated measurement time of a region for a thread +@param [in] region ID of region +@param [in] thread ID of thread +@return Measurement time of a region for a thread +*/ +extern double perfmon_getTimeOfRegion(int region, int thread) __attribute__ ((visibility ("default") )); +/*! \brief Get the call count of a region for a thread +@param [in] region ID of region +@param [in] thread ID of thread +@return Call count of a region for a thread +*/ +extern int perfmon_getCountOfRegion(int region, int thread) __attribute__ ((visibility ("default") )); +/*! \brief Get the event result of a region for an event and thread +@param [in] region ID of region +@param [in] event ID of event +@param [in] thread ID of thread +@return Result of a region for an event and thread +*/ +extern double perfmon_getResultOfRegionThread(int region, int event, int thread) __attribute__ ((visibility ("default") )); +/*! \brief Get the metric result of a region for a metric and thread +@param [in] region ID of region +@param [in] metricId ID of metric +@param [in] threadId ID of thread +@return Metric result of a region for a thread +*/ +extern double perfmon_getMetricOfRegionThread(int region, int metricId, int threadId) __attribute__ ((visibility ("default") )); + +/** @}*/ + +/* +################################################################################ +# Performance group related functions +################################################################################ +*/ + +/** \addtogroup PerfGroup performance group module + * @{ + */ + +/*! \brief The groupInfo data structure describes a performance group + +Groups can be either be read in from file or be a group with custom event set. For +performance groups commonly all values are set. For groups with custom event set, +the fields groupname and shortinfo are set to 'Custom', longinfo is NULL and in +general the nmetrics value is 0. +*/ +typedef struct { + char* groupname; /*!< \brief Name of the group: performance group name or 'Custom' */ + char* shortinfo; /*!< \brief Short info string for the group or 'Custom' */ + int nevents; /*!< \brief Number of event/counter combinations */ + char** events; /*!< \brief List of events */ + char** counters; /*!< \brief List of counter registers */ + int nmetrics; /*!< \brief Number of metrics */ + char** metricnames; /*!< \brief Metric names */ + char** metricformulas; /*!< \brief Metric formulas */ + char* longinfo; /*!< \brief Descriptive text about the group or empty */ +} GroupInfo; + +/*! \brief Initialize values in GroupInfo struct + +Initialize values in GroupInfo struct. The function does NOT allocate the GroupInfo struct +*/ +int perfgroup_new(GroupInfo* ginfo) __attribute__ ((visibility ("default") )); + +/*! \brief Add a counter and event combination to the group + +Add a counter and event combination to the group. +@param [in] ginfo GroupInfo struct +@param [in] counter String with counter name +@param [in] event String with event name +@return 0 for success, -EINVAL or -ENOMEM in case of error. +*/ +int perfgroup_addEvent(GroupInfo* ginfo, char* counter, char* event) __attribute__ ((visibility ("default") )); + +/*! 
\brief Remove a counter and event combination from a group + +Remove a counter and event combination from a group +@param [in] ginfo GroupInfo struct +@param [in] counter String with counter name +*/ +void perfgroup_removeEvent(GroupInfo* ginfo, char* counter) __attribute__ ((visibility ("default") )); + +/*! \brief Add a metric to the group + +Add a metric to the group +@param [in] ginfo GroupInfo struct +@param [in] mname String with metric name/description +@param [in] mcalc String with metric formula. No spaces in string. +@return 0 for success, -EINVAL or -ENOMEM in case of error. +*/ +int perfgroup_addMetric(GroupInfo* ginfo, char* mname, char* mcalc) __attribute__ ((visibility ("default") )); +/*! \brief Remove a metric from a group + +Remove a metric from a group +@param [in] ginfo GroupInfo struct +@param [in] mname String with metric name/description +*/ +void perfgroup_removeMetric(GroupInfo* ginfo, char* mname) __attribute__ ((visibility ("default") )); + +/*! \brief Get the event string of a group needed for perfmon_addEventSet + +Get the event string of a group needed for perfmon_addEventSet +@param [in] ginfo GroupInfo struct +@return String with eventset or NULL +*/ +char* perfgroup_getEventStr(GroupInfo* ginfo) __attribute__ ((visibility ("default") )); +/*! \brief Return the eventset string of a group + +Return the event string of a group +@param [in] eventStr Eventset string +*/ +void perfgroup_returnEventStr(char* eventStr) __attribute__ ((visibility ("default") )); + +/*! \brief Get the group name of a group + +Get the group name of a group +@param [in] ginfo GroupInfo struct +@return String with group name or NULL +*/ +char* perfgroup_getGroupName(GroupInfo* ginfo) __attribute__ ((visibility ("default") )); +/*! \brief Set the group name of a group + +Set the group name of a group. String must be zero-terminated +@param [in] ginfo GroupInfo struct +@param [in] groupName String with group name +@return 0 for success, -EINVAL or -ENOMEM in case of error. +*/ +int perfgroup_setGroupName(GroupInfo* ginfo, char* groupName) __attribute__ ((visibility ("default") )); +/*! \brief Return the group name string of a group + +Return the group name string of a group +@param [in] gname Group name string +*/ +void perfgroup_returnGroupName(char* gname) __attribute__ ((visibility ("default") )); + + +/*! \brief Set the short information string of a group + +Set the short information string of a group. String must be zero-terminated +@param [in] ginfo GroupInfo struct +@param [in] shortInfo String with short information +@return 0 for success, -EINVAL or -ENOMEM in case of error. +*/ +int perfgroup_setShortInfo(GroupInfo* ginfo, char* shortInfo) __attribute__ ((visibility ("default") )); +/*! \brief Get the short information string of a group + +Get the short information string of a group +@param [in] ginfo GroupInfo struct +@return String with short information or NULL +*/ +char* perfgroup_getShortInfo(GroupInfo* ginfo) __attribute__ ((visibility ("default") )); +/*! \brief Return the short information string of a group + +Return the short information string of a group +@param [in] sinfo Short information string +*/ +void perfgroup_returnShortInfo(char* sinfo) __attribute__ ((visibility ("default") )); + +/*! \brief Set the long information string of a group + +Set the long information string of a group. String must be zero-terminated +@param [in] ginfo GroupInfo struct +@param [in] longInfo String with long information +@return 0 for success, -EINVAL or -ENOMEM in case of error. 
+*/ +int perfgroup_setLongInfo(GroupInfo* ginfo, char* longInfo) __attribute__ ((visibility ("default") )); +/*! \brief Get the long information string of a group + +Get the long information string of a group +@param [in] ginfo GroupInfo struct +@return String with long information or NULL +*/ +char* perfgroup_getLongInfo(GroupInfo* ginfo) __attribute__ ((visibility ("default") )); +/*! \brief Return the long information string of a group + +Return the long information string of a group +@param [in] linfo Long information string +*/ +void perfgroup_returnLongInfo(char* linfo) __attribute__ ((visibility ("default") )); + +/*! \brief Merge two groups + +Merge two groups (group2 into group1). +@param [in,out] grp1 Group1 +@param [in] grp2 Group2 +@return 0 for success, -EINVAL or -ENOMEM in case of error. +*/ +int perfgroup_mergeGroups(GroupInfo* grp1, GroupInfo* grp2) __attribute__ ((visibility ("default") )); + +/*! \brief Read group from file + +Read group from file +@param [in] grouppath Base path to all groups +@param [in] architecture Architecture string (e.g. short_info in cpuid_info) +@param [in] groupname Group name +@param [in,out] ginfo Group filled with data from file +@return 0 for success, -EINVAL or -ENOMEM in case of error. +*/ +int perfgroup_readGroup(const char* grouppath, const char* architecture, const char* groupname, GroupInfo* ginfo) __attribute__ ((visibility ("default") )); +/*! \brief Create group from event string + +Create group from event string (list of event:counter(:opts)). +@param [in] eventStr event string +@param [in,out] ginfo Group filled with data from event string +@return 0 for success, -EINVAL or -ENOMEM in case of error. +*/ +int perfgroup_customGroup(const char* eventStr, GroupInfo* ginfo) __attribute__ ((visibility ("default") )); + +/*! \brief Return group + +Return group (frees internal lists) +@param [in] ginfo Performance group info +*/ +void perfgroup_returnGroup(GroupInfo* ginfo) __attribute__ ((visibility ("default") )); +/*! \brief Get all groups available in the system (base + user home) + +Get all groups available in the system (base + user home) +@param [in] grouppath Base path to all groups +@param [in] architecture Architecture string (e.g. short_info in cpuid_info) +@param [out] groupnames List of group names +@param [out] groupshort List of groups' short information string +@param [out] grouplong List of groups' long information string +@return number of groups, -EINVAL or -ENOMEM in case of error. +*/ +int perfgroup_getGroups( const char* grouppath, const char* architecture, char*** groupnames, char*** groupshort, char*** grouplong) __attribute__ ((visibility ("default") )); +/*! \brief Return list of all groups + +Return list of all groups +@param [in] groups Number of groups +@param [in] groupnames List of group names +@param [in] groupshort List of groups' short information string +@param [in] grouplong List of groups' long information string +*/ +void perfgroup_returnGroups(int groups, char** groupnames, char** groupshort, char** grouplong) __attribute__ ((visibility ("default") )); + + + + +/** @}*/ + +/* +################################################################################ +# Time measurements related functions +################################################################################ +*/ + +/** \addtogroup TimerMon Time measurement module + * @{ + */ + +/*! 
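\brief Usage sketch: timing a code section
+
+A minimal, hedged sketch of the timer module declared below; do_work() is a
+placeholder for the code to be measured, not a function of this library.
+\code
+TimerData t;
+timer_init();                       // determine baseline and CPU clock once
+timer_start(&t);
+do_work();                          // placeholder
+timer_stop(&t);
+double s = timer_print(&t);         // measured interval in seconds
+uint64_t c = timer_printCycles(&t); // measured interval in cycles
+\endcode
+*/
+/*!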
\brief Union describing a cycle counter, accessible either as one 64-bit value or as two 32-bit fields
\extends TimerData
*/
+typedef union
+{
+    uint64_t int64; /*!< \brief Cycle count in 64 bit */
+    struct {uint32_t lo, hi;} int32; /*!< \brief Cycle count stored in two 32 bit fields */
+} TscCounter;
+
+/*! \brief Struct defining the start and stop time of a time interval
+*/
+typedef struct {
+    TscCounter start; /*!< \brief Cycles at start */
+    TscCounter stop; /*!< \brief Cycles at stop */
+} TimerData;
+
+/*! \brief Initialize timer by retrieving baseline frequency and cpu clock
+*/
+extern void timer_init( void ) __attribute__ ((visibility ("default") ));
+/*! \brief Return the measured interval in seconds
+
+@param [in] time Structure holding the cycle count at start and stop
+@return Time in seconds
+*/
+extern double timer_print( const TimerData* time) __attribute__ ((visibility ("default") ));
+/*! \brief Return the measured interval in cycles
+
+@param [in] time Structure holding the cycle count at start and stop
+@return Time in cycles
+*/
+extern uint64_t timer_printCycles( const TimerData* time) __attribute__ ((visibility ("default") ));
+/*! \brief Reset values in TimerData
+
+@param [in] time Structure holding the cycle count at start and stop
+*/
+extern void timer_reset( TimerData* time ) __attribute__ ((visibility ("default") ));
+/*! \brief Return the CPU clock determined at timer_init
+
+@return CPU clock
+*/
+extern uint64_t timer_getCpuClock( void ) __attribute__ ((visibility ("default") ));
+/*! \brief Return the current CPU clock read from sysfs
+
+@param [in] cpu_id ID of CPU to read the clock for
+@return CPU clock
+*/
+extern uint64_t timer_getCpuClockCurrent( int cpu_id ) __attribute__ ((visibility ("default") ));
+/*! \brief Return the cycles clock determined at timer_init
+
+@return cycle clock
+*/
+extern uint64_t timer_getCycleClock( void ) __attribute__ ((visibility ("default") ));
+/*! \brief Return the baseline CPU clock determined at timer_init
+
+@return Baseline CPU clock
+*/
+extern uint64_t timer_getBaseline( void ) __attribute__ ((visibility ("default") ));
+/*! \brief Start time measurement
+
+@param [in,out] time Structure holding the cycle count at start
+*/
+extern void timer_start( TimerData* time ) __attribute__ ((visibility ("default") ));
+/*! \brief Stop time measurement
+
+@param [in,out] time Structure holding the cycle count at stop
+*/
+extern void timer_stop ( TimerData* time) __attribute__ ((visibility ("default") ));
+/*! \brief Sleep for specified usecs
+
+@param [in] usec Number of microseconds to sleep
+*/
+extern int timer_sleep(unsigned long usec) __attribute__ ((visibility ("default") ));
+
+/*! \brief Finalize timer module
+
+*/
+extern void timer_finalize(void) __attribute__ ((visibility ("default") ));
+
+/** @}*/
+
+/*
+################################################################################
+# Power measurements related functions
+################################################################################
+*/
+/** \addtogroup PowerMon Power and Energy monitoring module
+ * @{
+ */
+
+/*!
+\def NUM_POWER_DOMAINS
+Number of currently supported RAPL domains
+*/
+#define NUM_POWER_DOMAINS 5
+/*! \brief List of all RAPL domain names
+*/
+extern const char* power_names[NUM_POWER_DOMAINS] __attribute__ ((visibility ("default") ));
+
+/*!
+\def POWER_DOMAIN_SUPPORT_STATUS
+Flag to check in PowerDomain's supportFlag if the status msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_STATUS (1ULL<<0)
+/*!
+\def POWER_DOMAIN_SUPPORT_LIMIT
+Flag to check in PowerDomain's supportFlag if the limit msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_LIMIT (1ULL<<1)
+/*!
+\def POWER_DOMAIN_SUPPORT_POLICY
+Flag to check in PowerDomain's supportFlag if the policy msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_POLICY (1ULL<<2)
+/*!
+\def POWER_DOMAIN_SUPPORT_PERF
+Flag to check in PowerDomain's supportFlag if the perf msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_PERF (1ULL<<3)
+/*!
+\def POWER_DOMAIN_SUPPORT_INFO
+Flag to check in PowerDomain's supportFlag if the info msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_INFO (1ULL<<4)
+
+
+/*! \brief Information structure of CPU's turbo mode
+\extends PowerInfo
+*/
+typedef struct {
+    int numSteps; /*!< \brief Number of turbo mode steps/frequencies */
+    double* steps; /*!< \brief List of turbo mode steps */
+} TurboBoost;
+
+/*! \brief Enum for all supported RAPL domains
+\extends PowerDomain
+*/
+typedef enum {
+    PKG = 0, /*!< \brief PKG domain, mostly one CPU socket/package */
+    PP0 = 1, /*!< \brief PP0 domain, not clearly defined by Intel */
+    PP1 = 2, /*!< \brief PP1 domain, not clearly defined by Intel */
+    DRAM = 3, /*!< \brief DRAM domain, the memory modules */
+    PLATFORM = 4 /*!< \brief PLATFORM domain, the whole system (if powered through the main board) */
+} PowerType;
+
+/*! \brief Structure describing a RAPL power domain
+\extends PowerInfo
+*/
+typedef struct {
+    PowerType type; /*!< \brief Identifier which RAPL domain is managed by this struct */
+    uint32_t supportFlags; /*!< \brief Bitmask which features are supported by the power domain */
+    double energyUnit; /*!< \brief Multiplier for energy measurements */
+    double tdp; /*!< \brief Thermal Design Power (maximum amount of heat generated by the CPU) */
+    double minPower; /*!< \brief Minimal power consumption of the CPU */
+    double maxPower; /*!< \brief Maximal power consumption of the CPU */
+    double maxTimeWindow; /*!< \brief Minimal power measurement interval */
+} PowerDomain;
+
+/*! \brief Information structure of CPU's power measurement facility
+*/
+typedef struct {
+    double baseFrequency; /*!< \brief Base frequency of the CPU */
+    double minFrequency; /*!< \brief Minimal frequency of the CPU */
+    TurboBoost turbo; /*!< \brief Turbo boost information */
+    int hasRAPL; /*!< \brief RAPL support flag */
+    double powerUnit; /*!< \brief Multiplier for power measurements */
+    double timeUnit; /*!< \brief Multiplier for time information */
+    double uncoreMinFreq; /*!< \brief Minimal uncore frequency */
+    double uncoreMaxFreq; /*!< \brief Maximal uncore frequency */
+    uint8_t perfBias; /*!< \brief Performance energy bias */
+    PowerDomain domains[NUM_POWER_DOMAINS]; /*!< \brief List of power domains */
+} PowerInfo;
+
+/*! \brief Power measurement data for start/stop measurements
+*/
+typedef struct {
+    int domain; /*!< \brief RAPL domain identifier */
+    uint32_t before; /*!< \brief Counter state at start */
+    uint32_t after; /*!< \brief Counter state at stop */
+} PowerData;
+
+/*! \brief Variable holding the global power information structure */
+extern PowerInfo power_info;
+
+/** \brief Pointer for exporting the PowerInfo data structure */
+typedef PowerInfo* PowerInfo_t;
+/** \brief Pointer for exporting the PowerData data structure */
+typedef PowerData* PowerData_t;
+
+/*!
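\brief Usage sketch: measuring package energy
+
+A minimal, hedged sketch of the start/stop energy workflow declared below;
+CPU 0 and the PKG domain are illustrative choices, do_work() is a placeholder.
+\code
+PowerData pd;
+if (power_init(0) == 1)             // 1 = RAPL working
+{
+    power_start(&pd, 0, PKG);
+    do_work();                      // placeholder
+    power_stop(&pd, 0, PKG);
+    double joules = power_printEnergy(&pd);
+}
+power_finalize();
+\endcode
+*/
+/*!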
\brief Initialize energy measurements on specific CPU
+
+Additionally, it reads basic information about the energy measurements like
+minimal measurement time.
+@param [in] cpuId Initialize energy facility for this CPU
+@return RAPL status (0=No RAPL, 1=RAPL working)
+*/
+extern int power_init(int cpuId) __attribute__ ((visibility ("default") ));
+/*! \brief Get a pointer to the energy facility information
+
+@return PowerInfo_t pointer
+\sa PowerInfo_t
+*/
+extern PowerInfo_t get_powerInfo(void) __attribute__ ((visibility ("default") ));
+/*! \brief Read the current energy value
+
+@param [in] cpuId Read energy facility for this CPU
+@param [in] reg Energy register
+@param [out] data Energy data
+@return error code
+*/
+extern int power_read(int cpuId, uint64_t reg, uint32_t *data) __attribute__ ((visibility ("default") ));
+/*! \brief Read the current energy value using a specific communication socket
+
+@param [in] socket_fd Communication socket for the read operation
+@param [in] cpuId Read energy facility for this CPU
+@param [in] reg Energy register
+@param [out] data Energy data
+@return error code
+*/
+extern int power_tread(int socket_fd, int cpuId, uint64_t reg, uint32_t *data) __attribute__ ((visibility ("default") ));
+/*! \brief Start energy measurements
+
+@param [in,out] data Data structure holding start and stop values for energy measurements
+@param [in] cpuId Start energy facility for this CPU
+@param [in] type Which type should be measured
+@return error code
+*/
+extern int power_start(PowerData_t data, int cpuId, PowerType type) __attribute__ ((visibility ("default") ));
+/*! \brief Stop energy measurements
+
+@param [in,out] data Data structure holding start and stop values for energy measurements
+@param [in] cpuId Stop energy facility for this CPU
+@param [in] type Which type should be measured
+@return error code
+*/
+extern int power_stop(PowerData_t data, int cpuId, PowerType type) __attribute__ ((visibility ("default") ));
+/*! \brief Print energy measurements gathered by power_start() and power_stop()
+
+@param [in] data Data structure holding start and stop values for energy measurements
+@return Consumed energy in Joules
+*/
+extern double power_printEnergy(const PowerData* data) __attribute__ ((visibility ("default") ));
+/*! \brief Get energy unit
+
+@param [in] domain RAPL domain ID
+@return Energy unit of the given RAPL domain
+*/
+extern double power_getEnergyUnit(int domain) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the values of the limit register of a domain
+NOT IMPLEMENTED
+
+@param [in] cpuId CPU ID
+@param [in] domain RAPL domain ID
+@param [out] power Energy limit
+@param [out] time Time limit
+@return error code
+*/
+int power_limitGet(int cpuId, PowerType domain, double* power, double* time) __attribute__ ((visibility ("default") ));
+
+/*! \brief Set the values of the limit register of a domain
+NOT IMPLEMENTED
+
+@param [in] cpuId CPU ID
+@param [in] domain RAPL domain ID
+@param [in] power Energy limit
+@param [in] time Time limit
+@param [in] doClamping Activate clamping (going below OS-requested power level)
+@return error code
+*/
+int power_limitSet(int cpuId, PowerType domain, double power, double time, int doClamping) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the state of an energy limit, activated or deactivated
+NOT IMPLEMENTED
+
+@param [in] cpuId CPU ID
+@param [in] domain RAPL domain ID
+@return state, 1 for active, 0 for inactive
+*/
+int power_limitState(int cpuId, PowerType domain) __attribute__ ((visibility ("default") ));
+
+/*!
\brief Free space of power_unit +*/ +extern void power_finalize(void) __attribute__ ((visibility ("default") )); +/** @}*/ + +/* +################################################################################ +# Thermal measurements related functions +################################################################################ +*/ +/** \addtogroup ThermalMon Thermal monitoring module + * @{ + */ +/*! \brief Initialize thermal measurements on specific CPU + +@param [in] cpuId Initialize thermal facility for this CPU +*/ +extern void thermal_init(int cpuId) __attribute__ ((visibility ("default") )); +/*! \brief Read the current thermal value + +@param [in] cpuId Read thermal facility for this CPU +@param [out] data Thermal data +*/ +extern int thermal_read(int cpuId, uint32_t *data) __attribute__ ((visibility ("default") )); +/*! \brief Read the current thermal value using a specific communication socket + +@param [in] socket_fd Communication socket for the read operation +@param [in] cpuId Read thermal facility for this CPU +@param [out] data Thermal data +*/ +extern int thermal_tread(int socket_fd, int cpuId, uint32_t *data) __attribute__ ((visibility ("default") )); +/** @}*/ + + +/* +################################################################################ +# Memory sweeping related functions +################################################################################ +*/ +/** \addtogroup MemSweep Memory sweeping module + * @{ + */ +/*! \brief Sweeping the memory of a NUMA node + +Sweeps (zeros) the memory of NUMA node with ID \a domainId +@param [in] domainId NUMA node ID +*/ +extern void memsweep_domain(int domainId) __attribute__ ((visibility ("default") )); +/*! \brief Sweeping the memory of all NUMA nodes covered by CPU list + +Sweeps (zeros) the memory of all NUMA nodes containing the CPUs in \a processorList +@param [in] processorList List of CPU IDs +@param [in] numberOfProcessors Number of CPUs in list +*/ +extern void memsweep_threadGroup(const int* processorList, int numberOfProcessors) __attribute__ ((visibility ("default") )); +/** @}*/ + +/* +################################################################################ +# CPU feature related functions +################################################################################ +*/ +/** \addtogroup CpuFeatures Retrieval and manipulation of processor features + * @{ + */ +/*! \brief Enumeration of all CPU related features. 
+*/
+typedef enum {
+    FEAT_HW_PREFETCHER=0, /*!< \brief Hardware prefetcher */
+    FEAT_CL_PREFETCHER, /*!< \brief Adjacent cache line prefetcher */
+    FEAT_DCU_PREFETCHER, /*!< \brief DCU L1 data cache prefetcher */
+    FEAT_IP_PREFETCHER, /*!< \brief IP L1 data cache prefetcher */
+    FEAT_FAST_STRINGS, /*!< \brief Fast-strings feature */
+    FEAT_THERMAL_CONTROL, /*!< \brief Automatic Thermal Control Circuit */
+    FEAT_PERF_MON, /*!< \brief Hardware performance monitoring */
+    FEAT_FERR_MULTIPLEX, /*!< \brief FERR# Multiplexing, must be 1 for XAPIC interrupt model */
+    FEAT_BRANCH_TRACE_STORAGE, /*!< \brief Branch Trace Storage */
+    FEAT_XTPR_MESSAGE, /*!< \brief xTPR Message to set processor priority */
+    FEAT_PEBS, /*!< \brief Precise Event Based Sampling (PEBS) */
+    FEAT_SPEEDSTEP, /*!< \brief Enhanced Intel SpeedStep Technology to reduce energy consumption */
+    FEAT_MONITOR, /*!< \brief MONITOR/MWAIT feature to monitor write-back stores */
+    FEAT_SPEEDSTEP_LOCK, /*!< \brief Enhanced Intel SpeedStep Technology Select Lock */
+    FEAT_CPUID_MAX_VAL, /*!< \brief Limit CPUID Maxval */
+    FEAT_XD_BIT, /*!< \brief Execute Disable Bit */
+    FEAT_DYN_ACCEL, /*!< \brief Intel Dynamic Acceleration */
+    FEAT_TURBO_MODE, /*!< \brief Intel Turbo Mode */
+    FEAT_TM2, /*!< \brief Thermal Monitoring 2 */
+    CPUFEATURES_MAX
+} CpuFeature;
+
+/*! \brief Initialize the internal feature variables for all CPUs
+
+Initialize the internal feature variables for all CPUs
+*/
+extern void cpuFeatures_init() __attribute__ ((visibility ("default") ));
+/*! \brief Print state of all CPU features for a given CPU
+
+Print state of all CPU features for a given CPU
+@param [in] cpu CPU ID
+*/
+extern void cpuFeatures_print(int cpu) __attribute__ ((visibility ("default") ));
+/*! \brief Get state of a CPU feature for a given CPU
+
+Get state of a CPU feature for a given CPU
+@param [in] cpu CPU ID
+@param [in] type CPU feature
+@return State of CPU feature (1=enabled, 0=disabled)
+*/
+extern int cpuFeatures_get(int cpu, CpuFeature type) __attribute__ ((visibility ("default") ));
+/*! \brief Get the name of a CPU feature
+
+Get the name of a CPU feature
+@param [in] type CPU feature
+@return Name of the CPU feature or NULL if feature is not available
+*/
+extern char* cpuFeatures_name(CpuFeature type) __attribute__ ((visibility ("default") ));
+/*! \brief Enable a CPU feature for a specific CPU
+
+Enable a CPU feature for a specific CPU. Only the state of the prefetchers can be changed, all other features return -EINVAL
+@param [in] cpu CPU ID
+@param [in] type CPU feature
+@param [in] print Print outcome of operation
+@return Status of operation (0=success, all other values indicate errors, either from MSR access or an invalid feature)
+*/
+extern int cpuFeatures_enable(int cpu, CpuFeature type, int print) __attribute__ ((visibility ("default") ));
+/*! \brief Disable a CPU feature for a specific CPU
+
+Disable a CPU feature for a specific CPU.
Only the state of the prefetchers can be changed, all other features return -EINVAL
+@param [in] cpu CPU ID
+@param [in] type CPU feature
+@param [in] print Print outcome of operation
+@return Status of operation (0=success, all other values indicate errors, either from MSR access or an invalid feature)
+*/
+extern int cpuFeatures_disable(int cpu, CpuFeature type, int print) __attribute__ ((visibility ("default") ));
+/** @}*/
+
+
+/*
+################################################################################
+# CPU frequency related functions
+################################################################################
+*/
+/** \addtogroup CpuFreq Retrieval and manipulation of processor clock frequencies
+ * @{
+ */
+/*! \brief Initialize cpu frequency module
+
+Initialize cpu frequency module
+@return Returns 0 if successful and 1 for an invalid access mode
+*/
+extern int freq_init(void) __attribute__ ((visibility ("default") ));
+/*! \brief Get the base clock frequency of a hardware thread
+
+Get the base clock frequency of a hardware thread
+@param [in] cpu_id CPU ID
+@return Frequency or 0 in case of errors
+*/
+uint64_t freq_getCpuClockBase(const int cpu_id) __attribute__ ((visibility ("default") ));
+/*! \brief Get the current clock frequency of a hardware thread
+
+Get the current clock frequency of a hardware thread
+@param [in] cpu_id CPU ID
+@return Frequency or 0 in case of errors
+*/
+extern uint64_t freq_getCpuClockCurrent(const int cpu_id ) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the maximal clock frequency of a hardware thread
+
+Get the maximal clock frequency of a hardware thread
+@param [in] cpu_id CPU ID
+@return Frequency or 0 in case of errors
+*/
+extern uint64_t freq_getCpuClockMax(const int cpu_id ) __attribute__ ((visibility ("default") ));
+/*! \brief Get the maximal available clock frequency of a hardware thread
+
+Get the maximal available (configured) clock frequency of a hardware thread
+@param [in] cpu_id CPU ID
+@return Frequency or 0 in case of errors
+*/
+extern uint64_t freq_getConfCpuClockMax(const int cpu_id) __attribute__ ((visibility ("default") ));
+/*! \brief Set the maximal clock frequency of a hardware thread
+
+Set the maximal clock frequency of a hardware thread
+@param [in] cpu_id CPU ID
+@param [in] freq Frequency in kHz
+@return Frequency or 0 in case of errors
+*/
+extern uint64_t freq_setCpuClockMax(const int cpu_id, const uint64_t freq) __attribute__ ((visibility ("default") ));
+/*! \brief Get the minimal clock frequency of a hardware thread
+
+Get the minimal clock frequency of a hardware thread
+@param [in] cpu_id CPU ID
+@return Frequency or 0 in case of errors
+*/
+extern uint64_t freq_getCpuClockMin(const int cpu_id ) __attribute__ ((visibility ("default") ));
+/*! \brief Get the minimal available clock frequency of a hardware thread
+
+Get the minimal available (configured) clock frequency of a hardware thread
+@param [in] cpu_id CPU ID
+@return Frequency or 0 in case of errors
+*/
+extern uint64_t freq_getConfCpuClockMin(const int cpu_id) __attribute__ ((visibility ("default") ));
+/*! \brief Set the minimal clock frequency of a hardware thread
+
+Set the minimal clock frequency of a hardware thread
+@param [in] cpu_id CPU ID
+@param [in] freq Frequency in kHz
+@return Frequency or 0 in case of errors
+*/
+extern uint64_t freq_setCpuClockMin(const int cpu_id, const uint64_t freq) __attribute__ ((visibility ("default") ));
+/*!
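\brief Usage sketch: reading and pinning the clock
+
+A minimal, hedged sketch of the frequency module in this section; CPU 0 and
+the 2500000 kHz (2.5 GHz) value are illustrative only.
+\code
+if (freq_init() == 0)
+{
+    uint64_t cur = freq_getCpuClockCurrent(0); // current clock of HW thread 0
+    freq_setCpuClockMin(0, 2500000);           // frequencies are given in kHz
+    freq_setCpuClockMax(0, 2500000);
+}
+freq_finalize();
+\endcode
+*/
+/*!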
\brief De/Activate turbo mode for a hardware thread
+
+De/Activate turbo mode for a hardware thread
+@param [in] cpu_id CPU ID
+@param [in] turbo (0=off, 1=on)
+@return 1 on success, 0 in case of errors
+*/
+extern int freq_setTurbo(const int cpu_id, int turbo) __attribute__ ((visibility ("default") ));
+/*! \brief Get state of turbo mode for a hardware thread
+
+Get state of turbo mode for a hardware thread
+@param [in] cpu_id CPU ID
+@return 1=Turbo active or 0=Turbo inactive
+*/
+extern int freq_getTurbo(const int cpu_id) __attribute__ ((visibility ("default") ));
+/*! \brief Get the frequency governor of a hardware thread
+
+Get the frequency governor of a hardware thread. The returned string must be freed by the caller.
+@param [in] cpu_id CPU ID
+@return Governor or NULL in case of errors
+*/
+extern char * freq_getGovernor(const int cpu_id ) __attribute__ ((visibility ("default") ));
+/*! \brief Set the frequency governor of a hardware thread
+
+Set the frequency governor of a hardware thread.
+@param [in] cpu_id CPU ID
+@param [in] gov Governor
+@return 1 on success, 0 in case of errors
+*/
+extern int freq_setGovernor(const int cpu_id, const char* gov) __attribute__ ((visibility ("default") ));
+/*! \brief Get the available frequencies of a hardware thread
+
+Get the available frequencies of a hardware thread. The returned string must be freed by the caller.
+@param [in] cpu_id CPU ID
+@return String with available frequencies or NULL in case of errors
+*/
+extern char * freq_getAvailFreq(const int cpu_id ) __attribute__ ((visibility ("default") ));
+/*! \brief Get the available frequency governors of a hardware thread
+
+Get the available frequency governors of a hardware thread. The returned string must be freed by the caller.
+@param [in] cpu_id CPU ID
+@return String with available frequency governors or NULL in case of errors
+*/
+extern char * freq_getAvailGovs(const int cpu_id ) __attribute__ ((visibility ("default") ));
+
+/*! \brief Set the minimal Uncore frequency
+
+Set the minimal Uncore frequency. Since the ranges are not documented, valid frequencies range from the minimal CPU clock to the maximal Turbo clock. When selecting a frequency at the borders, please verify with the UNCORE_CLOCK event that the setting took effect.
+@param [in] socket_id ID of socket
+@param [in] freq Frequency in MHz
+@return 0 for success, -ERROR at failure
+*/
+extern int freq_setUncoreFreqMin(const int socket_id, const uint64_t freq) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the minimal Uncore frequency
+
+Get the minimal Uncore frequency.
+@param [in] socket_id ID of socket
+@return frequency in MHz or 0 at failure
+*/
+extern uint64_t freq_getUncoreFreqMin(const int socket_id) __attribute__ ((visibility ("default") ));
+
+/*! \brief Set the maximal Uncore frequency
+
+Set the maximal Uncore frequency. Since the ranges are not documented, valid frequencies range from the minimal CPU clock to the maximal Turbo clock. When selecting a frequency at the borders, please verify with the UNCORE_CLOCK event that the setting took effect.
+@param [in] socket_id ID of socket
+@param [in] freq Frequency in MHz
+@return 0 for success, -ERROR at failure
+*/
+extern int freq_setUncoreFreqMax(const int socket_id, const uint64_t freq) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the maximal Uncore frequency
+
+Get the maximal Uncore frequency.
+@param [in] socket_id ID of socket
+@return frequency in MHz or 0 at failure
+*/
+extern uint64_t freq_getUncoreFreqMax(const int socket_id) __attribute__ ((visibility ("default") ));
+/*! \brief Get the current Uncore frequency
+
+Get the current Uncore frequency.
+@param [in] socket_id ID of socket
+@return frequency in MHz or 0 at failure
+*/
+extern uint64_t freq_getUncoreFreqCur(const int socket_id) __attribute__ ((visibility ("default") ));
+/*! \brief Finalize cpu frequency module
+
+Finalize cpu frequency module
+*/
+extern void freq_finalize(void) __attribute__ ((visibility ("default") ));
+/** @}*/
+
+
+/*
+################################################################################
+# Performance monitoring for NVIDIA GPUs related functions
+################################################################################
+*/
+/** \addtogroup Nvmon Performance monitoring for NVIDIA GPUs
+ * @{
+ */
+
+#if defined(LIKWID_WITH_NVMON) || defined(LIKWID_NVMON)
+/*! \brief Structure with general GPU information for each device
+
+General information covers GPU devid, name and clock and memory specific information.
+Most information comes from cuDeviceGetProperties() and cuDeviceGetAttribute().
+*/
+typedef struct {
+    int devid; /*!< \brief Device ID */
+    int numaNode; /*!< \brief Closest NUMA domain to the device */
+    char* name; /*!< \brief Name of the device */
+    char* short_name; /*!< \brief Short name of the device */
+    uint64_t mem; /*!< \brief Total memory of device */
+    int ccapMajor; /*!< \brief Major number of device's compute capability */
+    int ccapMinor; /*!< \brief Minor number of device's compute capability */
+    int maxThreadsPerBlock; /*!< \brief Maximum number of threads per block */
+    int maxThreadsDim[3]; /*!< \brief Maximum sizes of each dimension of a block */
+    int maxGridSize[3]; /*!< \brief Maximum sizes of each dimension of a grid */
+    int sharedMemPerBlock; /*!< \brief Total amount of shared memory available per block */
+    int totalConstantMemory; /*!< \brief Total amount of constant memory available on the device */
+    int simdWidth; /*!< \brief SIMD width of arithmetic units = warp size */
+    int memPitch; /*!< \brief Maximum pitch allowed by the memory copy functions that involve memory regions allocated through cuMemAllocPitch() */
+    int regsPerBlock; /*!< \brief Total number of registers available per block */
+    int clockRatekHz; /*!< \brief Clock frequency in kilohertz */
+    int textureAlign; /*!< \brief Alignment requirement */
+    int surfaceAlign; /*!< \brief Alignment requirement for surfaces */
+    int l2Size; /*!< \brief L2 cache in bytes.
0 if the device doesn't have L2 cache */
+    int memClockRatekHz; /*!< \brief Peak memory clock frequency in kilohertz */
+    int pciBus; /*!< \brief PCI bus identifier of the device */
+    int pciDev; /*!< \brief PCI device (also known as slot) identifier of the device */
+    int pciDom; /*!< \brief PCI domain identifier of the device */
+    int maxBlockRegs; /*!< \brief Maximum number of 32-bit registers available to a thread block */
+    int numMultiProcs; /*!< \brief Number of multiprocessors on the device */
+    int maxThreadPerMultiProc; /*!< \brief Maximum resident threads per multiprocessor */
+    int memBusWidth; /*!< \brief Global memory bus width in bits */
+    int unifiedAddrSpace; /*!< \brief 1 if the device shares a unified address space with the host, or 0 if not */
+    int ecc; /*!< \brief 1 if error correction is enabled on the device, 0 if error correction is disabled or not supported by the device */
+    int asyncEngines; /*!< \brief Number of asynchronous engines */
+    int mapHostMem; /*!< \brief 1 if the device can map host memory into the CUDA address space */
+    int integrated; /*!< \brief 1 if the device is an integrated (motherboard) GPU and 0 if it is a discrete (card) component */
+} GpuDevice;
+
+
+/*! \brief Structure holding information of all GPUs
+
+*/
+typedef struct {
+    int numDevices; /*!< \brief Number of detected devices */
+    GpuDevice* devices; /*!< \brief List with GPU-specific topology information */
+} GpuTopology;
+
+/*! \brief Variable holding the global gpu information structure */
+extern GpuTopology gpuTopology;
+/** \brief Pointer for exporting the GpuTopology data structure */
+typedef GpuTopology* GpuTopology_t;
+
+
+/*! \brief Initialize GPU topology information
+
+Reads in the topology information from the CUDA library (if found).
+\sa GpuTopology_t
+@return 0 or -errno in case of error
+*/
+extern int topology_gpu_init(void) __attribute__ ((visibility ("default") ));
+/*! \brief Destroy GPU topology structure GpuTopology_t
+
+Retrieved pointers to the structures are not valid anymore after this function call
+\sa GpuTopology_t
+*/
+extern void topology_gpu_finalize(void) __attribute__ ((visibility ("default") ));
+/*! \brief Retrieve GPU topology of the current machine
+
+\sa GpuTopology_t
+@return GpuTopology_t (pointer to internal gpuTopology structure)
+*/
+extern GpuTopology_t get_gpuTopology(void) __attribute__ ((visibility ("default") ));
+
+
+/*
+################################################################################
+# NvMarker API related functions
+################################################################################
+*/
+/** \addtogroup NvMarkerAPI Marker API module for GPUs
+* @{
+*/
+/*! \brief Initialize NvLIKWID's marker API
+
+Must be called in serial region of the application to set up basic data structures
+of LIKWID.
+Reads environment variables:
+- LIKWID_GEVENTS (GPU event string)
+- LIKWID_GPUS (GPU list separated by ,)
+- LIKWID_GPUFILEPATH (Outputpath for NvMarkerAPI file)
+*/
+extern void likwid_gpuMarkerInit(void) __attribute__ ((visibility ("default") ));
+/*! \brief Select next group to measure
+
+Must be called in parallel region of the application to switch group on every CPU.
+*/
+extern void likwid_gpuMarkerNextGroup(void) __attribute__ ((visibility ("default") ));
+/*! \brief Close LIKWID's NvMarker API
+
+Must be called in serial region of the application. It gathers all data of regions and
+writes them out to a file (filepath in env variable LIKWID_GPUFILEPATH).
+*/
+extern void likwid_gpuMarkerClose(void) __attribute__ ((visibility ("default") ));
+/*! \brief Register a measurement region
+
+Initializes the hashTable entry in order to reduce execution time of likwid_gpuMarkerStartRegion()
+@param regionTag [in] Initialize data using this string
+@return Error code
+*/
+extern int likwid_gpuMarkerRegisterRegion(const char* regionTag) __attribute__ ((visibility ("default") ));
+/*! \brief Start a measurement region
+
+Reads the values of all configured counters and saves the results under the name given
+in regionTag.
+@param regionTag [in] Store data using this string
+@return Error code of start operation
+*/
+extern int likwid_gpuMarkerStartRegion(const char* regionTag) __attribute__ ((visibility ("default") ));
+/*! \brief Stop a measurement region
+
+Reads the values of all configured counters and saves the results under the name given
+in regionTag. The measurement data of the stopped region gets summed up in global region counters.
+@param regionTag [in] Store data using this string
+@return Error code of stop operation
+*/
+extern int likwid_gpuMarkerStopRegion(const char* regionTag) __attribute__ ((visibility ("default") ));
+/*! \brief Reset a measurement region
+
+Reset the values of all configured counters and timers.
+@param regionTag [in] Reset data using this string
+@return Error code of reset operation
+*/
+extern int likwid_gpuMarkerResetRegion(const char* regionTag) __attribute__ ((visibility ("default") ));
+/*! \brief Get accumulated data of a code region
+
+Get the accumulated data of the current thread for the given regionTag.
+@param regionTag [in] Print data using this string
+@param nr_gpus [in,out] Length of first dimension of the arrays. Afterwards the actual count of GPUs.
+@param nr_events [in,out] Length of events array
+@param events [out] Events array for the intermediate results
+@param time [out] Accumulated measurement time
+@param count [out] Call count of the code region
+*/
+extern void likwid_gpuMarkerGetRegion(const char* regionTag, int* nr_gpus, int* nr_events, double** events, double **time, int **count) __attribute__ ((visibility ("default") ));
+
+/*! \brief Read the output file of the NvMarker API
+@param [in] filename Filename with NvMarker API results
+@return 0 or negative error number
+*/
+int nvmon_readMarkerFile(const char* filename) __attribute__ ((visibility ("default") ));
+/*! \brief Free space for read in NvMarker API file
+*/
+void nvmon_destroyMarkerResults() __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of regions listed in NvMarker API result file
+
+@return Number of regions
+*/
+int nvmon_getNumberOfRegions() __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of metrics of a region
+@param [in] region ID of region
+@return Number of metrics of region
+*/
+int nvmon_getMetricsOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of GPUs of a region
+@param [in] region ID of region
+@return Number of GPUs of region
+*/
+int nvmon_getGpusOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the GPU list of a region
+@param [in] region ID of region
+@param [in] count Length of gpulist array
+@param [in,out] gpulist gpulist array
+@return Number of GPUs of region or count, whichever is lower
+*/
+int nvmon_getGpulistOfRegion(int region, int count, int* gpulist) __attribute__ ((visibility ("default") ));
+/*!
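\brief Usage sketch: instrumenting a GPU code region
+
+A minimal, hedged sketch of the NvMarker API declared above; the region name
+"scale" is illustrative and the eventset is taken from LIKWID_GEVENTS;
+launch_kernel() is a placeholder, not a function of this library.
+\code
+likwid_gpuMarkerInit();
+likwid_gpuMarkerRegisterRegion("scale");
+likwid_gpuMarkerStartRegion("scale");
+launch_kernel();                    // placeholder for the measured GPU code
+likwid_gpuMarkerStopRegion("scale");
+likwid_gpuMarkerClose();            // writes the NvMarkerAPI result file
+\endcode
+*/
+/*!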
\brief Get the accumulated measurement time of a region for a GPU
+@param [in] region ID of region
+@param [in] gpu ID of GPU
+@return Measurement time of a region for a GPU
+*/
+double nvmon_getTimeOfRegion(int region, int gpu) __attribute__ ((visibility ("default") ));
+/*! \brief Get the call count of a region for a GPU
+@param [in] region ID of region
+@param [in] gpu ID of GPU
+@return Call count of a region for a GPU
+*/
+int nvmon_getCountOfRegion(int region, int gpu) __attribute__ ((visibility ("default") ));
+/*! \brief Get the groupID of a region
+
+@param [in] region ID of region
+@return Group ID of region
+*/
+int nvmon_getGroupOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the tag of a region
+@param [in] region ID of region
+@return tag of region
+*/
+char* nvmon_getTagOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of events of a region
+@param [in] region ID of region
+@return Number of events of region
+*/
+int nvmon_getEventsOfRegion(int region) __attribute__ ((visibility ("default") ));
+/*! \brief Get the event result of a region for an event and GPU
+@param [in] region ID of region
+@param [in] eventId ID of event
+@param [in] gpuId ID of GPU
+@return Result of a region for an event and GPU
+*/
+double nvmon_getResultOfRegionGpu(int region, int eventId, int gpuId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the metric result of a region for a metric and GPU
+@param [in] region ID of region
+@param [in] metricId ID of metric
+@param [in] gpuId ID of GPU
+@return Metric result of a region for a GPU
+*/
+double nvmon_getMetricOfRegionGpu(int region, int metricId, int gpuId) __attribute__ ((visibility ("default") ));
+
+/** @}*/
+
+/*
+################################################################################
+# Nvmon related functions (Nvidia GPU monitoring)
+################################################################################
+*/
+
+/** \addtogroup Nvmon Nvidia GPU monitoring API module for GPUs
+* @{
+*/
+
+/*! \brief Element in the output list from nvmon_getEventsOfGpu
+
+It holds the name, the description and the limitation string for one event.
+*/
+typedef struct {
+    char* name; /*!< \brief Name of the event */
+    char* desc; /*!< \brief Description of the event */
+    char* limit; /*!< \brief Limitation string of the event, commonly 'GPU' */
+} NvmonEventListEntry;
+
+/*! \brief Output list from nvmon_getEventsOfGpu with all supported events
+
+Output list from nvmon_getEventsOfGpu with all supported events
+*/
+typedef struct {
+    int numEvents; /*!< \brief Number of events */
+    NvmonEventListEntry *events; /*!< \brief List of events */
+} NvmonEventList;
+/** \brief Pointer for exporting the NvmonEventList data structure */
+typedef NvmonEventList* NvmonEventList_t;
+
+
+/*! \brief Get the list of supported events of a GPU
+
+@param [in] gpuId ID of GPU (from GPU topology)
+@param [out] list List of events
+@return Number of supported events or -errno
+*/
+int nvmon_getEventsOfGpu(int gpuId, NvmonEventList_t* list);
+/*! \brief Return the list of supported events of a GPU
+
+Return the list of supported events of a GPU obtained from nvmon_getEventsOfGpu()
+@param [in] list List of events
+*/
+void nvmon_returnEventsOfGpu(NvmonEventList_t list);
+
+
+/*! \brief Initialize the Nvidia GPU performance monitoring facility (Nvmon)
+
+Initialize the Nvidia GPU performance monitoring feature by creating basic data structures.
+
+The CUDA and CUPTI library paths need to be in LD_LIBRARY_PATH to be found by dlopen.
+
+@param [in] nrGpus Number of GPUs
+@param [in] gpuIds List of GPUs
+@return error code (0 on success, -ERRORCODE on failure)
+*/
+int nvmon_init(int nrGpus, const int* gpuIds) __attribute__ ((visibility ("default") ));
+
+/*! \brief Close the Nvidia GPU performance monitoring facility of LIKWID (Nvmon)
+
+Deallocates all internal data that is used during Nvmon performance monitoring. Also
+the counter values are not accessible anymore after calling this function.
+*/
+void nvmon_finalize(void) __attribute__ ((visibility ("default") ));
+/*! \brief Add an event string to LIKWID Nvmon
+
+An event string looks like Eventname:Countername,...
+The eventname and countername are checked if they are available.
+
+@param [in] eventCString Event string
+@return Returns the ID of the new eventSet
+*/
+int nvmon_addEventSet(const char* eventCString) __attribute__ ((visibility ("default") ));
+/*! \brief Setup all Nvmon performance monitoring counters of an eventSet
+
+@param [in] gid Group ID (as returned by nvmon_addEventSet())
+@return error code (-ENOENT if groupId is invalid and -1 if the counters of one CPU cannot be set up)
+*/
+int nvmon_setupCounters(int gid) __attribute__ ((visibility ("default") ));
+/*! \brief Start Nvmon performance monitoring counters
+
+Start the counters that have been previously set up by nvmon_setupCounters().
+The registered counters are zeroed before being enabled.
+@return 0 on success and -(gpuid+1) for error
+*/
+int nvmon_startCounters(void) __attribute__ ((visibility ("default") ));
+/*! \brief Stop Nvmon performance monitoring counters
+
+Stop the counters that have been previously started by nvmon_startCounters().
+@return 0 on success and -(gpuid+1) for error
+*/
+int nvmon_stopCounters(void) __attribute__ ((visibility ("default") ));
+/*! \brief Read the Nvmon performance monitoring counters on all GPUs
+
+Read the counters that have been previously started by nvmon_startCounters().
+@return 0 on success and -(gpuid+1) for error
+*/
+int nvmon_readCounters(void) __attribute__ ((visibility ("default") ));
+/*! \brief Switch the active eventSet to a new one (Nvmon)
+
+Stops the currently running counters, switches the eventSet by setting up the
+counters and starts the counters.
+@param [in] new_group ID of group that should be switched to.
+@return 0 on success and -(thread_id+1) for error
+*/
+int nvmon_switchActiveGroup(int new_group) __attribute__ ((visibility ("default") ));
+/*! \brief Set verbosity of LIKWID Nvmon library
+
+@param [in] level Verbosity level
+*/
+void nvmon_setVerbosity(int level) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get the results of the specified group, counter and GPU (Nvmon)
+
+Get the result of all measurement cycles.
+@param [in] groupId ID of the group that should be read
+@param [in] eventId ID of the event that should be read
+@param [in] gpuId ID of the GPU that should be read
+@return The counter result
+*/
+double nvmon_getResult(int groupId, int eventId, int gpuId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the last results of the specified group, counter and GPU (Nvmon)
+
+Get the result of the last measurement cycle (between start/stop, start/read, read/read or read/stop).
+
+@param [in] groupId ID of the group that should be read
+@param [in] eventId ID of the event that should be read
+@param [in] gpuId ID of the GPU that should be read
+@return The counter result
+*/
+double nvmon_getLastResult(int groupId, int eventId, int gpuId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the metric result of the specified group, counter and GPU (Nvmon)
+
+Get the metric result of all measurement cycles. It reads all raw results for the given groupId and gpuId.
+@param [in] groupId ID of the group that should be read
+@param [in] metricId ID of the metric that should be calculated
+@param [in] gpuId ID of the GPU that should be read
+@return The metric result
+*/
+double nvmon_getMetric(int groupId, int metricId, int gpuId);
+/*! \brief Get the last metric result of the specified group, counter and GPU (Nvmon)
+
+Get the metric result of the last measurement cycle. It reads all raw results for the given groupId and gpuId.
+@param [in] groupId ID of the group that should be read
+@param [in] metricId ID of the metric that should be calculated
+@param [in] gpuId ID of the GPU that should be read
+@return The metric result
+*/
+double nvmon_getLastMetric(int groupId, int metricId, int gpuId);
+/*! \brief Get the number of configured event groups (Nvmon)
+
+@return Number of groups
+*/
+int nvmon_getNumberOfGroups(void) __attribute__ ((visibility ("default") ));
+/*! \brief Get the ID of the currently set up event group (Nvmon)
+
+@return Number of active group
+*/
+int nvmon_getIdOfActiveGroup(void) __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of GPUs specified at nvmon_init() (Nvmon)
+
+@return Number of GPUs
+*/
+int nvmon_getNumberOfGPUs(void) __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of configured events in a group (Nvmon)
+
+@param [in] groupId ID of group
+@return Number of events
+*/
+int nvmon_getNumberOfEvents(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the number of configured metrics for group (Nvmon)
+
+@param [in] groupId ID of group
+@return Number of metrics
+*/
+int nvmon_getNumberOfMetrics(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the accumulated measurement time of a group (Nvmon)
+
+@param [in] groupId ID of group
+@return Time in seconds the event group was measured
+*/
+double nvmon_getTimeOfGroup(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the last measurement time of a group (Nvmon)
+
+@param [in] groupId ID of group
+@return Time in seconds the event group was measured the last time
+*/
+double nvmon_getLastTimeOfGroup(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the event name of the specified group and event (Nvmon)
+
+Get the event name as defined in the performance group file
+@param [in] groupId ID of the group that should be read
+@param [in] eventId ID of the event that should be returned
+@return The event name or NULL in case of failure
+*/
+char* nvmon_getEventName(int groupId, int eventId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the counter name of the specified group and event (Nvmon)
+
+Get the counter name as defined in the performance group file
+@param [in] groupId ID of the group that should be read
+@param [in] eventId ID of the event of which the counter should be returned
+@return The counter name or NULL in case of failure
+*/
+char* nvmon_getCounterName(int groupId, int eventId) __attribute__ ((visibility ("default") ));
+/*!
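\brief Usage sketch: one Nvmon measurement cycle
+
+A minimal, hedged sketch of the Nvmon workflow in this section; GPU 0 and the
+eventset string are illustrative values only, and launch_kernel() is a
+placeholder, not a function of this library.
+\code
+int gpus[] = {0};
+nvmon_init(1, gpus);
+int gid = nvmon_addEventSet("INST_EXECUTED:GPU0"); // example Eventname:Countername pair
+nvmon_setupCounters(gid);
+nvmon_startCounters();
+launch_kernel();                                   // placeholder
+nvmon_stopCounters();
+double r = nvmon_getResult(gid, 0, 0);             // group, event, GPU index
+nvmon_finalize();
+\endcode
+*/
+/*!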
\brief Get the metric name of the specified group and metric (Nvmon)
+
+Get the metric name as defined in the performance group file
+@param [in] groupId ID of the group that should be read
+@param [in] metricId ID of the metric that should be calculated
+@return The metric name or NULL in case of failure
+*/
+char* nvmon_getMetricName(int groupId, int metricId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the group name (Nvmon)
+
+Get the name of the group. It is either the name of the performance group or "Custom"
+@param [in] groupId ID of the group that should be read
+@return The group name or NULL in case of failure
+*/
+char* nvmon_getGroupName(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the short informational string of the specified group (Nvmon)
+
+Returns the short information string as defined by performance groups or "Custom"
+in case of custom event sets
+@param [in] groupId ID of the group that should be read
+@return The short information or NULL in case of failure
+*/
+char* nvmon_getGroupInfoShort(int groupId) __attribute__ ((visibility ("default") ));
+/*! \brief Get the long descriptive string of the specified group (Nvmon)
+
+Returns the long descriptive string as defined by performance groups or NULL
+in case of custom event sets
+@param [in] groupId ID of the group that should be read
+@return The long description or NULL in case of failure
+*/
+char* nvmon_getGroupInfoLong(int groupId) __attribute__ ((visibility ("default") ));
+
+/*! \brief Get all groups (Nvmon)
+
+Checks the configured performance group path for the current GPU and
+returns all found group names
+@param [in] gpuId Get groups for a specific GPU
+@param [out] groups List of group names
+@param [out] shortinfos List of short information string about group
+@param [out] longinfos List of long information string about group
+@return Number of found performance groups
+*/
+int nvmon_getGroups(int gpuId, char*** groups, char*** shortinfos, char*** longinfos) __attribute__ ((visibility ("default") ));
+/*!
\brief Free all group information (Nvmon)
+
+@param [in] nrgroups Number of groups
+@param [in] groups List of group names
+@param [in] shortinfos List of short information string about group
+@param [in] longinfos List of long information string about group
+*/
+int nvmon_returnGroups(int nrgroups, char** groups, char** shortinfos, char** longinfos) __attribute__ ((visibility ("default") ));
+
+
+
+/** @}*/
+
+#endif /* LIKWID_WITH_NVMON */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*LIKWID_H*/
diff --git a/collectors/likwidMetric.go b/collectors/likwidMetric.go
new file mode 100644
index 0000000..0f69756
--- /dev/null
+++ b/collectors/likwidMetric.go
@@ -0,0 +1,167 @@
+package collectors
+
+/*
+#cgo CFLAGS: -I./likwid
+#cgo LDFLAGS: -L./likwid -llikwid -llikwid-hwloc -lm
+#include <stdlib.h>
+#include <likwid.h>
+*/
+import "C"
+
+import (
+	"fmt"
+	"log"
+	"strings"
+	"time"
+	"unsafe"
+)
+
+type LikwidCollector struct {
+	MetricCollector
+	cpulist  []C.int
+	sock2tid map[int]int
+	metrics  map[C.int]map[string]int
+	groups   map[string]C.int
+	init     bool
+}
+
+type LikwidMetric struct {
+	name         string
+	search       string
+	socket_scope bool
+	group_idx    int
+}
+
+const GROUPPATH = `/home/unrz139/Work/cc-metric-collector/collectors/likwid/groups`
+
+var likwid_metrics = map[string][]LikwidMetric{
+	"MEM_DP": {LikwidMetric{name: "mem_bw", search: "Memory bandwidth [MBytes/s]", socket_scope: true},
+		LikwidMetric{name: "pwr1", search: "Power [W]", socket_scope: true},
+		LikwidMetric{name: "pwr2", search: "Power DRAM [W]", socket_scope: true},
+		LikwidMetric{name: "flops_dp", search: "DP [MFLOP/s]", socket_scope: false}},
+	"FLOPS_SP": {LikwidMetric{name: "clock", search: "Clock [MHz]", socket_scope: false},
+		LikwidMetric{name: "cpi", search: "CPI", socket_scope: false},
+		LikwidMetric{name: "flops_sp", search: "SP [MFLOP/s]", socket_scope: false}},
+}
+
+// getMetricId searches the metrics of a perfmon group for a name containing
+// the search string and returns its index.
+func getMetricId(group C.int, search string) (int, error) {
+	for i := 0; i < int(C.perfmon_getNumberOfMetrics(group)); i++ {
+		mname := C.perfmon_getMetricName(group, C.int(i))
+		go_mname := C.GoString(mname)
+		if strings.Contains(go_mname, search) {
+			return i, nil
+		}
+	}
+	return -1, fmt.Errorf("cannot find metric for search string '%s' in group %d", search, int(group))
+}
+
+// getSocketCpus maps one representative CPU (the first of each socket) to its
+// socket ID using LIKWID's cpustr_to_cpulist resolver.
+func getSocketCpus() map[C.int]int {
+	slist := SocketList()
+	var cpu C.int
+	outmap := make(map[C.int]int)
+	for _, s := range slist {
+		t := C.CString(fmt.Sprintf("S%d", s))
+		clen := C.cpustr_to_cpulist(t, &cpu, 1)
+		C.free(unsafe.Pointer(t))
+		if int(clen) == 1 {
+			outmap[cpu] = s
+		}
+	}
+	return outmap
+}
+
+func (m *LikwidCollector) Init() {
+	m.name = "LikwidCollector"
+	m.setup()
+	cpulist := CpuList()
+	m.cpulist = make([]C.int, len(cpulist))
+	slist := getSocketCpus()
+
+	m.sock2tid = make(map[int]int)
+	for i, c := range cpulist {
+		m.cpulist[i] = C.int(c)
+		if sid, found := slist[m.cpulist[i]]; found {
+			m.sock2tid[sid] = i
+		}
+	}
+	m.metrics = make(map[C.int]map[string]int)
+	C.topology_init()
+	C.perfmon_init(C.int(len(m.cpulist)), &m.cpulist[0])
+	gpath := C.CString(GROUPPATH)
+	C.config_setGroupPath(gpath)
+	C.free(unsafe.Pointer(gpath))
+	m.init = true
+	m.groups = make(map[string]C.int)
+	for g, metrics := range likwid_metrics {
+		cstr := C.CString(g)
+		gid := C.perfmon_addEventSet(cstr)
+		if gid >= 0 {
+			m.groups[g] = gid
+			for i, metric := range metrics {
+				idx, err := getMetricId(gid, metric.search)
+				if err != nil {
+					log.Print(err)
+				} else {
+					likwid_metrics[g][i].group_idx = idx
+				}
+			}
+		} else {
+			log.Print("Failed to add event set ", g)
+		}
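+		// release the C string for the group name; it was allocated with C.CString above
+		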
C.free(unsafe.Pointer(cstr)) + } +} + +func (m *LikwidCollector) Read(interval time.Duration) { + if m.init { + for gname, gid := range m.groups { + C.perfmon_setupCounters(gid) + C.perfmon_startCounters() + time.Sleep(interval) + C.perfmon_stopCounters() + + for _, lmetric := range likwid_metrics[gname] { + if lmetric.socket_scope { + for sid, tid := range m.sock2tid { + res := C.perfmon_getLastMetric(gid, C.int(lmetric.group_idx), C.int(tid)) + m.sockets[int(sid)][lmetric.name] = float64(res) + // log.Print("Metric '", lmetric.name,"' on Socket ",int(sid)," returns ", m.sockets[int(sid)][lmetric.name]) + } + } else { + for tid, cpu := range m.cpulist { + res := C.perfmon_getLastMetric(gid, C.int(lmetric.group_idx), C.int(tid)) + m.cpus[int(cpu)][lmetric.name] = float64(res) + // log.Print("Metric '", lmetric.name,"' on CPU ",int(cpu)," returns ", m.cpus[int(cpu)][lmetric.name]) + } + } + } + for cpu := range m.cpus { + if flops_dp, found := m.cpus[cpu]["flops_dp"]; found { + if flops_sp, found := m.cpus[cpu]["flops_sp"]; found { + m.cpus[cpu]["flops_any"] = flops_dp.(float64) + flops_sp.(float64) + } + } + } + for sid := range m.sockets { + if pwr1, found := m.sockets[int(sid)]["pwr1"]; found { + if pwr2, found := m.sockets[int(sid)]["pwr2"]; found { + sum := pwr1.(float64) + pwr2.(float64) + if sum > 0 { + m.sockets[int(sid)]["power"] = sum + } + delete(m.sockets[int(sid)], "pwr2") + } + delete(m.sockets[int(sid)], "pwr1") + } + } + } + } +} + +func (m *LikwidCollector) Close() { + C.perfmon_finalize() + C.topology_finalize() + m.init = false + return +} diff --git a/collectors/loadavgMetric.go b/collectors/loadavgMetric.go new file mode 100644 index 0000000..560872d --- /dev/null +++ b/collectors/loadavgMetric.go @@ -0,0 +1,39 @@ +package collectors + +import ( + "io/ioutil" + "strconv" + "strings" + "time" +) + +const LOADAVGFILE = `/proc/loadavg` + +type LoadavgCollector struct { + MetricCollector +} + +func (m *LoadavgCollector) Init() { + m.name = "LoadavgCollector" + m.setup() +} + +func (m *LoadavgCollector) Read(interval time.Duration) { + buffer, err := ioutil.ReadFile(string(LOADAVGFILE)) + + if err != nil { + return + } + + ls := strings.Split(string(buffer), ` `) + loadOne, _ := strconv.ParseFloat(ls[0], 64) + m.node["load_one"] = float64(loadOne) + loadFive, _ := strconv.ParseFloat(ls[1], 64) + m.node["load_five"] = float64(loadFive) + loadFifteen, _ := strconv.ParseFloat(ls[2], 64) + m.node["load_fifteen"] = float64(loadFifteen) +} + +func (m *LoadavgCollector) Close() { + return +} diff --git a/collectors/lustreMetric.go b/collectors/lustreMetric.go new file mode 100644 index 0000000..7f94df7 --- /dev/null +++ b/collectors/lustreMetric.go @@ -0,0 +1,60 @@ +package collectors + +import ( + "io/ioutil" + "log" + "strconv" + "strings" + "time" +) + +const LUSTREFILE = `/proc/fs/lustre/llite/lnec-XXXXXX/stats` + +type LustreCollector struct { + MetricCollector +} + +func (m *LustreCollector) Init() { + m.name = "LustreCollector" + m.setup() +} + +func (m *LustreCollector) Read(interval time.Duration) { + buffer, err := ioutil.ReadFile(string(LUSTREFILE)) + + if err != nil { + log.Print(err) + return + } + + for _, line := range strings.Split(string(buffer), "\n") { + lf := strings.Fields(line) + if len(lf) > 1 { + switch lf[0] { + case "read_bytes": + m.node["read_bytes"], err = strconv.ParseInt(lf[6], 0, 64) + m.node["read_requests"], err = strconv.ParseInt(lf[1], 0, 64) + case "write_bytes": + m.node["write_bytes"], err = strconv.ParseInt(lf[6], 0, 64) + 
m.node["write_requests"], err = strconv.ParseInt(lf[1], 0, 64) + case "open": + m.node["open"], err = strconv.ParseInt(lf[1], 0, 64) + case "close": + m.node["close"], err = strconv.ParseInt(lf[1], 0, 64) + case "setattr": + m.node["setattr"], err = strconv.ParseInt(lf[1], 0, 64) + case "getattr": + m.node["getattr"], err = strconv.ParseInt(lf[1], 0, 64) + case "statfs": + m.node["statfs"], err = strconv.ParseInt(lf[1], 0, 64) + case "inode_permission": + m.node["inode_permission"], err = strconv.ParseInt(lf[1], 0, 64) + } + + } + } +} + +func (m *LustreCollector) Close() { + return +} diff --git a/collectors/memavg/config.json b/collectors/memavg/config.json deleted file mode 100644 index a1b8ffb..0000000 --- a/collectors/memavg/config.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "command": "read_memavg.sh" -} diff --git a/collectors/memavg/read_memavg.go b/collectors/memavg/read_memavg.go deleted file mode 100644 index 9590878..0000000 --- a/collectors/memavg/read_memavg.go +++ /dev/null @@ -1,48 +0,0 @@ -package main -import ( - "strings" - "io/ioutil" - "fmt" - "time" - "os" - "strconv" - ) - -func main() { - t := time.Now() - hostname, err := os.Hostname() - if err != nil { - fmt.Println("#", err) - os.Exit(1) - } - hostname = strings.Split(hostname, ".")[0] - data, err := ioutil.ReadFile("/proc/meminfo") - if err != nil { - fmt.Println("#", err) - os.Exit(1) - return - } - lines := strings.Split(string(data), "\n") - for _, l := range lines { - if strings.HasPrefix(l, "MemTotal") { - f := strings.Fields(l) - v, err := strconv.ParseInt(f[1], 10, 0) - if err == nil { - fmt.Printf("mem_total,hostname=%s value=%v %v\n", hostname, v*1024, t.UnixNano()) - } - } else if strings.HasPrefix(l, "MemAvailable") { - f := strings.Fields(l) - v, err := strconv.ParseInt(f[1], 10, 0) - if err == nil { - fmt.Printf("mem_avail,hostname=%s value=%v %v\n", hostname, v*1024, t.UnixNano()) - } - } else if strings.HasPrefix(l, "MemFree") { - f := strings.Fields(l) - v, err := strconv.ParseInt(f[1], 10, 0) - if err == nil { - fmt.Printf("mem_free,hostname=%s value=%v %v\n", hostname, v*1024, t.UnixNano()) - } - } - } - return -} diff --git a/collectors/memavg/read_memavg.sh b/collectors/memavg/read_memavg.sh deleted file mode 100755 index d2f3db1..0000000 --- a/collectors/memavg/read_memavg.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - - -TOTAL=$(grep "MemTotal" /proc/meminfo | awk '{print $2}') -AVAIL=$(grep "MemAvailable" /proc/meminfo | awk '{print $2}') -FREE=$(grep "MemFree" /proc/meminfo | awk '{print $2}') -HOST=$(hostname -s) - - -echo "mem_total,host=$HOST value=$TOTAL" -echo "mem_avail,host=$HOST value=$AVAIL" -echo "mem_free,host=$HOST value=$FREE" diff --git a/collectors/memstatMetric.go b/collectors/memstatMetric.go new file mode 100644 index 0000000..15c75c3 --- /dev/null +++ b/collectors/memstatMetric.go @@ -0,0 +1,54 @@ +package collectors + +import ( + "errors" + "io/ioutil" + "log" + "strconv" + "strings" + "time" +) + +const MEMSTATFILE = `/proc/meminfo` + +type MemstatCollector struct { + MetricCollector +} + +func (m *MemstatCollector) Init() { + m.name = "MemstatCollector" + m.setup() +} + +func (m *MemstatCollector) Read(interval time.Duration) { + buffer, err := ioutil.ReadFile(string(MEMSTATFILE)) + + if err != nil { + log.Print(err) + return + } + + ll := strings.Split(string(buffer), "\n") + memstats := make(map[string]int64) + + for _, line := range ll { + ls := strings.Split(line, `:`) + if len(ls) > 1 { + lv := strings.Fields(ls[1]) + memstats[ls[0]], err = strconv.ParseInt(lv[0], 
diff --git a/collectors/metricCollector.go b/collectors/metricCollector.go
new file mode 100644
index 0000000..500058f
--- /dev/null
+++ b/collectors/metricCollector.go
@@ -0,0 +1,116 @@
+package collectors
+
+import (
+    "io/ioutil"
+    "log"
+    "strconv"
+    "strings"
+    "time"
+)
+
+type MetricGetter interface {
+    Name() string
+    Init()
+    Read(time.Duration)
+    Close()
+    GetNodeMetric() map[string]interface{}
+    GetSocketMetrics() map[int]map[string]interface{}
+    GetCpuMetrics() map[int]map[string]interface{}
+}
+
+type MetricCollector struct {
+    name    string
+    node    map[string]interface{}
+    sockets map[int]map[string]interface{}
+    cpus    map[int]map[string]interface{}
+}
+
+func (c *MetricCollector) Name() string {
+    return c.name
+}
+
+func (c *MetricCollector) GetNodeMetric() map[string]interface{} {
+    return c.node
+}
+
+func (c *MetricCollector) GetSocketMetrics() map[int]map[string]interface{} {
+    return c.sockets
+}
+
+func (c *MetricCollector) GetCpuMetrics() map[int]map[string]interface{} {
+    return c.cpus
+}
+
+func (c *MetricCollector) setup() error {
+    slist := SocketList()
+    clist := CpuList()
+    c.node = make(map[string]interface{})
+    c.sockets = make(map[int]map[string]interface{}, len(slist))
+    for _, s := range slist {
+        c.sockets[s] = make(map[string]interface{})
+    }
+    c.cpus = make(map[int]map[string]interface{}, len(clist))
+    for _, s := range clist {
+        c.cpus[s] = make(map[string]interface{})
+    }
+    return nil
+}
+
+func intArrayContains(array []int, str int) (int, bool) {
+    for i, a := range array {
+        if a == str {
+            return i, true
+        }
+    }
+    return -1, false
+}
+
+func SocketList() []int {
+    buffer, err := ioutil.ReadFile("/proc/cpuinfo")
+    if err != nil {
+        log.Print(err)
+        return nil
+    }
+    ll := strings.Split(string(buffer), "\n")
+    var packs []int
+    for _, line := range ll {
+        if strings.HasPrefix(line, "physical id") {
+            lv := strings.Fields(line)
+            id, err := strconv.ParseInt(lv[3], 10, 32)
+            if err != nil {
+                log.Print(err)
+                return packs
+            }
+            _, found := intArrayContains(packs, int(id))
+            if !found {
+                packs = append(packs, int(id))
+            }
+        }
+    }
+    return packs
+}
+
+func CpuList() []int {
+    buffer, err := ioutil.ReadFile("/proc/cpuinfo")
+    if err != nil {
+        log.Print(err)
+        return nil
+    }
+    ll := strings.Split(string(buffer), "\n")
+    var cpulist []int
+    for _, line := range ll {
+        if strings.HasPrefix(line, "processor") {
+            lv := strings.Fields(line)
+            id, err := strconv.ParseInt(lv[2], 10, 32)
+            if err != nil {
+                log.Print(err)
+                return cpulist
+            }
+            _, found := intArrayContains(cpulist, int(id))
+            if !found {
+                cpulist = append(cpulist, int(id))
+            }
+        }
+    }
+    return cpulist
+}
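
MetricCollector provides the storage maps and setup(), so a concrete collector only has to implement Init, Read and Close and fill m.node, m.sockets or m.cpus. As a sketch of the pattern, a hypothetical collector for /proc/uptime (not part of this commit; name and metric key are illustrative) could look like:

    package collectors

    import (
        "io/ioutil"
        "log"
        "strconv"
        "strings"
        "time"
    )

    type UptimeCollector struct {
        MetricCollector
    }

    func (m *UptimeCollector) Init() {
        m.name = "UptimeCollector"
        m.setup()
    }

    func (m *UptimeCollector) Read(interval time.Duration) {
        buffer, err := ioutil.ReadFile("/proc/uptime")
        if err != nil {
            log.Print(err)
            return
        }
        // The first field of /proc/uptime is the uptime in seconds
        f := strings.Fields(string(buffer))
        if len(f) > 0 {
            if v, err := strconv.ParseFloat(f[0], 64); err == nil {
                m.node["uptime"] = v
            }
        }
    }

    func (m *UptimeCollector) Close() {
        return
    }
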
diff --git a/collectors/netstatMetric.go b/collectors/netstatMetric.go
new file mode 100644
index 0000000..623cf7b
--- /dev/null
+++ b/collectors/netstatMetric.go
@@ -0,0 +1,58 @@
+package collectors
+
+import (
+    "fmt"
+    "io/ioutil"
+    "log"
+    "strconv"
+    "strings"
+    "time"
+)
+
+const NETSTATFILE = `/proc/net/dev`
+
+type NetstatCollector struct {
+    MetricCollector
+}
+
+func (m *NetstatCollector) Init() {
+    m.name = "NetstatCollector"
+    m.setup()
+}
+
+func (m *NetstatCollector) Read(interval time.Duration) {
+    data, err := ioutil.ReadFile(string(NETSTATFILE))
+    if err != nil {
+        log.Print(err.Error())
+        return
+    }
+    var matches = map[int]string{
+        1:  "bytes_in",
+        9:  "bytes_out",
+        2:  "pkts_in",
+        10: "pkts_out",
+    }
+
+    lines := strings.Split(string(data), "\n")
+    for _, l := range lines {
+        if !strings.Contains(l, ":") {
+            continue
+        }
+        f := strings.Fields(l)
+        dev := f[0][0 : len(f[0])-1]
+        if dev == "lo" {
+            continue
+        }
+        for i, name := range matches {
+            v, err := strconv.ParseInt(f[i], 10, 0)
+            if err == nil {
+                m.node[fmt.Sprintf("%s_%s", dev, name)] = float64(v) * 1.0e-3
+            }
+        }
+    }
+
+}
+
+func (m *NetstatCollector) Close() {
+    return
+}
diff --git a/config.json b/config.json
index 1c836ac..f2bba58 100644
--- a/config.json
+++ b/config.json
@@ -1,33 +1,20 @@
 {
     "sink": {
         "user": "admin",
-        "password": "12345"
+        "password": "12345",
+        "host": "localhost",
+        "port": "8080",
+        "database": "testdb",
+        "type": "stdout"
     },
-    "host": "localhost",
-    "port": "8080",
-    "report": {
-        "levels": ["core","node"],
-        "interval": 120
-    },
-    "schedule": {
-        "core": {
-            "frequency": 30,
-            "duration": 10
-        },
-        "node":{
-            "frequency": 60,
-            "duration": 20
-        }
-    },
-    "metrics": [
-        "ipc",
-        "flops_any",
-        "clock",
-        "load",
-        "mem_bw",
-        "mem_used",
-        "net_bw",
-        "file_bw"
-    ],
-    "collector_path": "./collectors"
+    "interval" : 3,
+    "duration" : 1,
+    "collectors": [
+        "memstat",
+        "likwid",
+        "loadavg",
+        "netstat",
+        "ibstat",
+        "lustrestat"
+    ]
 }
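
With all sink settings now grouped under "sink", switching the output only means changing "type" to another registered sink. For example, the "influxdb" sink registered below in metric-collector.go would be selected with a block like the following (host, port and credentials are placeholders):

    "sink": {
        "user": "admin",
        "password": "12345",
        "host": "localhost",
        "port": "8086",
        "database": "testdb",
        "type": "influxdb"
    }
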
diff --git a/go.mod b/go.mod
index 9771f8d..90b6fe0 100644
--- a/go.mod
+++ b/go.mod
@@ -2,4 +2,7 @@ module github.com/ClusterCockpit/cc-metric-collector
 
 go 1.16
 
-require github.com/influxdata/line-protocol v0.0.0-20210311194329-9aa0e372d097
+require (
+    github.com/influxdata/influxdb-client-go/v2 v2.2.2
+    github.com/influxdata/line-protocol v0.0.0-20210311194329-9aa0e372d097
+)
diff --git a/go.sum b/go.sum
index 3a13697..f63eae9 100644
--- a/go.sum
+++ b/go.sum
@@ -1,2 +1,61 @@
+github.com/cyberdelia/templates v0.0.0-20141128023046-ca7fffd4298c/go.mod h1:GyV+0YP4qX0UQ7r2MoYZ+AvYDp12OF5yg4q8rGnyNh4=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/deepmap/oapi-codegen v1.3.13 h1:9HKGCsdJqE4dnrQ8VerFS0/1ZOJPmAhN+g8xgp8y3K4=
+github.com/deepmap/oapi-codegen v1.3.13/go.mod h1:WAmG5dWY8/PYHt4vKxlt90NsbHMAOCiteYKZMiIRfOo=
+github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ=
+github.com/getkin/kin-openapi v0.13.0/go.mod h1:WGRs2ZMM1Q8LR1QBEwUxC6RJEfaBcD0s+pcEVXFuAjw=
+github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
+github.com/go-chi/chi v4.0.2+incompatible/go.mod h1:eB3wogJHnLi3x/kFX2A+IbTBlXxmMeXJVKy9tTv1XzQ=
+github.com/golangci/lint-1 v0.0.0-20181222135242-d2cdd8c08219/go.mod h1:/X8TswGSh1pIozq4ZwCfxS0WA5JGXguxk94ar/4c87Y=
+github.com/influxdata/influxdb-client-go v1.4.0 h1:+KavOkwhLClHFfYcJMHHnTL5CZQhXJzOm5IKHI9BqJk=
+github.com/influxdata/influxdb-client-go/v2 v2.2.2 h1:O0CGIuIwQafvAxttAJ/VqMKfbWWn2Mt8rbOmaM2Zj4w=
+github.com/influxdata/influxdb-client-go/v2 v2.2.2/go.mod h1:fa/d1lAdUHxuc1jedx30ZfNG573oQTQmUni3N6pcW+0=
+github.com/influxdata/line-protocol v0.0.0-20200327222509-2487e7298839/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo=
 github.com/influxdata/line-protocol v0.0.0-20210311194329-9aa0e372d097 h1:vilfsDSy7TDxedi9gyBkMvAirat/oRcL0lFdJBf6tdM=
 github.com/influxdata/line-protocol v0.0.0-20210311194329-9aa0e372d097/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo=
+github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
+github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
+github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
+github.com/labstack/echo/v4 v4.1.11/go.mod h1:i541M3Fj6f76NZtHSj7TXnyM8n2gaodfvfxNnFqi74g=
+github.com/labstack/gommon v0.3.0/go.mod h1:MULnywXg0yavhxWKc+lOruYdAhDwPK9wf0OL7NoOu+k=
+github.com/matryer/moq v0.0.0-20190312154309-6cfb0558e1bd/go.mod h1:9ELz6aaclSIGnZBoaSLZ3NAl1VTufbOrXBPvtcy6WiQ=
+github.com/mattn/go-colorable v0.1.2/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE=
+github.com/mattn/go-colorable v0.1.4/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE=
+github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s=
+github.com/mattn/go-isatty v0.0.9/go.mod h1:YNRxwqDuOph6SZLI9vUUz6OYw3QyUt7WiY2yME+cCiQ=
+github.com/mattn/go-isatty v0.0.10/go.mod h1:qgIWMr58cqv1PHHyhnkY9lrL7etaEgOFcMEpPG5Rm84=
+github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
+github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
+github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
+github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
+github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
+github.com/valyala/fasttemplate v1.0.1/go.mod h1:UQGH1tvbgY+Nz5t2n7tXsz52dQxojPUpymEIMZ47gx8=
+github.com/valyala/fasttemplate v1.1.0/go.mod h1:UQGH1tvbgY+Nz5t2n7tXsz52dQxojPUpymEIMZ47gx8=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
+golang.org/x/crypto v0.0.0-20191112222119-e1110fd1c708/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
+golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20191112182307-2180aed22343 h1:00ohfJ4K98s3m6BGUoBd8nyfp4Yl0GoIKvw5abItTjI=
+golang.org/x/net v0.0.0-20191112182307-2180aed22343/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20191008105621-543471e840be/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20191115151921-52ab43148777/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
+gopkg.in/yaml.v2 v2.3.0 h1:clyUAQHOM3G0M3f5vQj7LuJrETvjVot3Z5el9nffUtU=
+gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
diff --git a/metric-collector.go b/metric-collector.go
new file mode 100644
index 0000000..3021cce
--- /dev/null
+++ b/metric-collector.go
@@ -0,0 +1,211 @@
+package main
+
+import (
+    "encoding/json"
+    "fmt"
+    "github.com/ClusterCockpit/cc-metric-collector/collectors"
+    "github.com/ClusterCockpit/cc-metric-collector/sinks"
+    "log"
+    "os"
+    "os/signal"
+    "strings"
+    "sync"
+    "time"
+)
+
+// List of provided collectors. Which collectors are run is configured
+// via the 'collectors' list in 'config.json'.
+var Collectors = map[string]collectors.MetricGetter{
+    "likwid":     &collectors.LikwidCollector{},
+    "loadavg":    &collectors.LoadavgCollector{},
+    "memstat":    &collectors.MemstatCollector{},
+    "netstat":    &collectors.NetstatCollector{},
+    "ibstat":     &collectors.InfinibandCollector{},
+    "lustrestat": &collectors.LustreCollector{},
+}
+
+var Sinks = map[string]sinks.SinkFuncs{
+    "influxdb": &sinks.InfluxSink{},
+    "stdout":   &sinks.StdoutSink{},
+}
+
+// Structure of the configuration file
+type GlobalConfig struct {
+    Sink struct {
+        User     string `json:"user"`
+        Password string `json:"password"`
+        Host     string `json:"host"`
+        Port     string `json:"port"`
+        Database string `json:"database"`
+        Type     string `json:"type"`
+    } `json:"sink"`
+    Interval   int      `json:"interval"`
+    Duration   int      `json:"duration"`
+    Collectors []string `json:"collectors"`
+}
+
+// Load JSON configuration file
+func LoadConfiguration(file string, config *GlobalConfig) error {
+    configFile, err := os.Open(file)
+    if err != nil {
+        return err
+    }
+    defer configFile.Close()
+    jsonParser := json.NewDecoder(configFile)
+    return jsonParser.Decode(config)
+}
+
+// Register an interrupt handler for Ctrl+C and similar. On receiving a
+// signal, all collectors and the sink are closed
+func shutdown(wg *sync.WaitGroup, config *GlobalConfig, sink sinks.SinkFuncs) {
+    sigs := make(chan os.Signal, 1)
+    signal.Notify(sigs, os.Interrupt)
+
+    go func(wg *sync.WaitGroup) {
+        <-sigs
+        log.Print("Shutdown...")
+        for _, c := range config.Collectors {
+            col := Collectors[c]
+            log.Print("Stop ", col.Name())
+            col.Close()
+        }
+        time.Sleep(1 * time.Second)
+        sink.Close()
+        wg.Done()
+    }(wg)
+}
+
+func main() {
+    var config GlobalConfig
+    var wg sync.WaitGroup
+    wg.Add(1)
+    host, err := os.Hostname()
+    if err != nil {
+        log.Print(err)
+        return
+    }
+
+    // Load and check configuration
+    err = LoadConfiguration("config.json", &config)
+    if err != nil {
+        log.Print("Error reading configuration file 'config.json': ", err.Error())
+        return
+    }
+    if config.Interval <= 0 {
+        log.Print("Configuration value 'interval' must be greater than zero")
+        return
+    }
+    if config.Duration <= 0 {
+        log.Print("Configuration value 'duration' must be greater than zero")
+        return
+    }
+    if len(config.Collectors) == 0 {
+        var keys []string
+        for k := range Collectors {
+            keys = append(keys, k)
+        }
+        log.Print("Configuration value 'collectors' does not contain any collector. Available: ", strings.Join(keys, ", "))
+        return
+    }
+    for _, name := range config.Collectors {
+        if _, found := Collectors[name]; !found {
+            log.Print("Invalid collector '", name, "' in configuration")
+            return
+        }
+    }
+    if _, found := Sinks[config.Sink.Type]; !found {
+        log.Print("Invalid sink type '", config.Sink.Type, "' in configuration")
+        return
+    }
+    // Set up sink
+    sink := Sinks[config.Sink.Type]
+    err = sink.Init(config.Sink.Host, config.Sink.Port, config.Sink.User, config.Sink.Password, config.Sink.Database)
+    if err != nil {
+        return
+    }
+
+    // Register interrupt handler
+    shutdown(&wg, &config, sink)
+
+    // Initialize all collectors
+    for _, c := range config.Collectors {
+        col := Collectors[c]
+        col.Init()
+        log.Print("Start ", col.Name())
+    }
+
+    // Set up the ticker loop
+    log.Print("Running loop every ", time.Duration(config.Interval)*time.Second)
+    ticker := time.NewTicker(time.Duration(config.Interval) * time.Second)
+    done := make(chan bool)
+
+    // Storage for all node metrics
+    nodeFields := make(map[string]interface{})
+
+    // Storage for all socket metrics
+    slist := collectors.SocketList()
+    socketsFields := make(map[int]map[string]interface{}, len(slist))
+    for _, s := range slist {
+        socketsFields[s] = make(map[string]interface{})
+    }
+
+    // Storage for all CPU metrics
+    clist := collectors.CpuList()
+    cpuFields := make(map[int]map[string]interface{}, len(clist))
+    for _, s := range clist {
+        cpuFields[s] = make(map[string]interface{})
+    }
+
+    go func() {
+        for {
+            select {
+            case <-done:
+                return
+            case t := <-ticker.C:
+                // Count how many socket and cpu metrics are returned
+                scount := 0
+                ccount := 0
+
+                // Read all collectors and sort the results into the right
+                // storage locations
+                for _, c := range config.Collectors {
+                    col := Collectors[c]
+                    // 'duration' is configured in seconds
+                    col.Read(time.Duration(config.Duration) * time.Second)
+
+                    for key, val := range col.GetNodeMetric() {
+                        nodeFields[key] = val
+                    }
+                    for sid, socket := range col.GetSocketMetrics() {
+                        for key, val := range socket {
+                            socketsFields[sid][key] = val
+                            scount++
+                        }
+                    }
+                    for cid, cpu := range col.GetCpuMetrics() {
+                        for key, val := range cpu {
+                            cpuFields[cid][key] = val
+                            ccount++
+                        }
+                    }
+                }
+
+                // Send out node metrics
+                sink.Write("node", map[string]string{"host": host}, nodeFields, t)
+
+                // Send out socket metrics (if any)
+                if scount > 0 {
+                    for sid, socket := range socketsFields {
+                        sink.Write("socket", map[string]string{"socket": fmt.Sprintf("%d", sid), "host": host}, socket, t)
+                    }
+                }
+
+                // Send out CPU metrics (if any)
+                if ccount > 0 {
+                    for cid, cpu := range cpuFields {
+                        sink.Write("cpu", map[string]string{"cpu": fmt.Sprintf("%d", cid), "host": host}, cpu, t)
+                    }
+                }
+            }
+        }
+    }()
+
+    // Wait until receiving an interrupt
+    wg.Wait()
+}
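
The Collectors and Sinks maps act as registries: adding a component means implementing the respective interface and adding one entry. A hypothetical registration of the illustrative UptimeCollector sketched earlier (name and type are not part of this commit) would be:

    var Collectors = map[string]collectors.MetricGetter{
        // ... existing entries ...
        "uptime": &collectors.UptimeCollector{},
    }
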
diff --git a/sinks/influxSink.go b/sinks/influxSink.go
new file mode 100644
index 0000000..628a403
--- /dev/null
+++ b/sinks/influxSink.go
@@ -0,0 +1,44 @@
+package sinks
+
+import (
+    "context"
+    "fmt"
+    influxdb2 "github.com/influxdata/influxdb-client-go/v2"
+    influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api"
+    "log"
+    "time"
+)
+
+type InfluxSink struct {
+    Sink
+    client       influxdb2.Client
+    writeApi     influxdb2Api.WriteAPIBlocking
+    retPolicy    string
+    organization string
+}
+
+func (s *InfluxSink) Init(host string, port string, user string, password string, database string) error {
+    s.host = host
+    s.port = port
+    s.user = user
+    s.password = password
+    s.database = database
+    s.organization = ""
+    uri := fmt.Sprintf("http://%s:%s", host, port)
+    auth := fmt.Sprintf("%s:%s", user, password)
+    log.Print("Using URI ", uri, " for connection")
+    s.client = influxdb2.NewClient(uri, auth)
+    s.writeApi = s.client.WriteAPIBlocking(s.organization, s.database)
+    return nil
+}
+
+func (s *InfluxSink) Write(measurement string, tags map[string]string, fields map[string]interface{}, t time.Time) error {
+    p := influxdb2.NewPoint(measurement, tags, fields, t)
+    err := s.writeApi.WritePoint(context.Background(), p)
+    return err
+}
+
+func (s *InfluxSink) Close() {
+    log.Print("Closing InfluxDB connection")
+    s.client.Close()
+}
diff --git a/sinks/sink.go b/sinks/sink.go
new file mode 100644
index 0000000..1efdc81
--- /dev/null
+++ b/sinks/sink.go
@@ -0,0 +1,19 @@
+package sinks
+
+import (
+    "time"
+)
+
+type Sink struct {
+    host     string
+    port     string
+    user     string
+    password string
+    database string
+}
+
+type SinkFuncs interface {
+    Init(host string, port string, user string, password string, database string) error
+    Write(measurement string, tags map[string]string, fields map[string]interface{}, t time.Time) error
+    Close()
+}
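
Any type satisfying SinkFuncs can be plugged into the Sinks registry. As a minimal sketch (purely illustrative, not part of this commit), a sink that discards all metrics could look like:

    package sinks

    import "time"

    type NullSink struct {
        Sink
    }

    func (s *NullSink) Init(host string, port string, user string, password string, database string) error {
        return nil
    }

    func (s *NullSink) Write(measurement string, tags map[string]string, fields map[string]interface{}, t time.Time) error {
        // Drop the point silently
        return nil
    }

    func (s *NullSink) Close() {
    }
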
map[string]string{"socket": fmt.Sprintf("%d", sid), "host": host}, socket, t) + } + } + + // Send out CPU metrics (if any) + if ccount > 0 { + for cid, cpu := range cpuFields { + sink.Write("cpu", map[string]string{"cpu": fmt.Sprintf("%d", cid), "host": host}, cpu, t) + } + } + } + } + }() + + // Wait until receiving an interrupt + wg.Wait() +} diff --git a/sinks/influxSink.go b/sinks/influxSink.go new file mode 100644 index 0000000..628a403 --- /dev/null +++ b/sinks/influxSink.go @@ -0,0 +1,44 @@ +package sinks + +import ( + "context" + "fmt" + influxdb2 "github.com/influxdata/influxdb-client-go/v2" + influxdb2Api "github.com/influxdata/influxdb-client-go/v2/api" + "log" + "time" +) + +type InfluxSink struct { + Sink + client influxdb2.Client + writeApi influxdb2Api.WriteAPIBlocking + retPolicy string + organization string +} + +func (s *InfluxSink) Init(host string, port string, user string, password string, database string) error { + s.host = host + s.port = port + s.user = user + s.password = password + s.database = database + s.organization = "" + uri := fmt.Sprintf("http://%s:%s", host, port) + auth := fmt.Sprintf("%s:%s", user, password) + log.Print("Using URI ", uri, " for connection") + s.client = influxdb2.NewClient(uri, auth) + s.writeApi = s.client.WriteAPIBlocking(s.organization, s.database) + return nil +} + +func (s *InfluxSink) Write(measurement string, tags map[string]string, fields map[string]interface{}, t time.Time) error { + p := influxdb2.NewPoint(measurement, tags, fields, t) + err := s.writeApi.WritePoint(context.Background(), p) + return err +} + +func (s *InfluxSink) Close() { + log.Print("Closing InfluxDB connection") + s.client.Close() +} diff --git a/sinks/sink.go b/sinks/sink.go new file mode 100644 index 0000000..1efdc81 --- /dev/null +++ b/sinks/sink.go @@ -0,0 +1,19 @@ +package sinks + +import ( + "time" +) + +type Sink struct { + host string + port string + user string + password string + database string +} + +type SinkFuncs interface { + Init(host string, port string, user string, password string, database string) error + Write(measurement string, tags map[string]string, fields map[string]interface{}, t time.Time) error + Close() +} diff --git a/sinks/stdoutSink.go b/sinks/stdoutSink.go new file mode 100644 index 0000000..d110411 --- /dev/null +++ b/sinks/stdoutSink.go @@ -0,0 +1,44 @@ +package sinks + +import ( + "fmt" + "math" + "strings" + "time" +) + +type StdoutSink struct { + Sink +} + +func (s *StdoutSink) Init(host string, port string, user string, password string, database string) error { + s.host = host + s.port = port + s.user = user + s.password = password + s.database = database + return nil +} + +func (s *StdoutSink) Write(measurement string, tags map[string]string, fields map[string]interface{}, t time.Time) error { + var tagsstr []string + var fieldstr []string + for k, v := range tags { + tagsstr = append(tagsstr, fmt.Sprintf("%s=%s", k, v)) + } + for k, v := range fields { + if !math.IsNaN(v.(float64)) { + fieldstr = append(fieldstr, fmt.Sprintf("%s=%v", k, v.(float64))) + } + } + if len(tagsstr) > 0 { + fmt.Printf("%s,%s %s %d\n", measurement, strings.Join(tagsstr, ","), strings.Join(fieldstr, ","), t.Unix()) + } else { + fmt.Printf("%s %s %d\n", measurement, strings.Join(fieldstr, ","), t.Unix()) + } + return nil +} + +func (s *StdoutSink) Close() { + return +}