Merge latest development changes to main branch (#79)

* Cleanup: Remove unused code * Use Golang duration parser for 'interval' and 'duration' in main config * Update handling of LIKWID headers. Download only if not already present in the system. Fixes #73 * Units with cc-units (#64) * Add option to normalize units with cc-unit * Add unit conversion to router * Add option to change unit prefix in the router * Add to MetricRouter README * Add order of operations in router to README * Use second add_tags/del_tags only if metric gets renamed * Skip disks in DiskstatCollector that have size=0 * Check readability of sensor files in TempCollector * Fix for --once option * Rename `cpu` type to `hwthread` (#69) * Rename 'cpu' type to 'hwthread' to avoid naming clashes with MetricStore and CC-Webfrontend * Collectors in parallel (#74) * Provide info to CollectorManager whether the collector can be executed in parallel with others * Split serial and parallel collectors. Read in parallel first * Update NvidiaCollector with new metrics, MIG and NvLink support (#75) * CC topology module update (#76) * Rename CPU to hardware thread, write some comments * Do renaming in other parts * Remove CpuList and SocketList function from metricCollector. 
Available in ccTopology * Option to use MIG UUID as subtype-id in NvidiaCollector * Option to use MIG slice name as subtype-id in NvidiaCollector * MetricRouter: Fix JSON in README * Fix for Github Action to really use the selected version * Remove Ganglia installation in runonce Action and add Go 1.18 * Fix daemon options in init script * Add separate go.mod files to use it with deprecated 1.16 * Minor updates for Makefiles * fix string comparison * AMD ROCm SMI collector (#77) * Add collector for AMD ROCm SMI metrics * Fix import path * Fix imports * Remove Board Number * store GPU index explicitly * Remove board number from description * Use http instead of ftp to download likwid * Fix serial number in rocmCollector * Improved http sink (#78) * automatic flush in NatsSink * tweak default options of HttpSink * shorter cirt. section and retries for HttpSink * fix error handling * Remove file added by mistake. * Use http instead of ftp to download likwid * Fix serial number in rocmCollector Co-authored-by: Thomas Roehl <thomas.roehl@fau.de> Co-authored-by: Holger Obermaier <40787752+ho-ob@users.noreply.github.com> Co-authored-by: Lou <lou.knauer@gmx.de>
2025-12-23 15:46:16 +01:00 · 2022-06-08 15:25:40 +02:00
parent 186a62a86b
commit 8d85bd53f1
51 changed files with 2097 additions and 705 deletions
--- a/collectors/Makefile
+++ b/collectors/Makefile
@@ -1,22 +1,28 @@
-
-all: likwid
-
-
 # LIKWID version
 LIKWID_VERSION = 5.2.1
+LIKWID_INSTALLED_FOLDER=$(shell dirname $(shell which likwid-topology 2>/dev/null) 2>/dev/null)
+
+LIKWID_FOLDER="$(shell pwd)/likwid"
+
+all: $(LIKWID_FOLDER)/likwid.h

 .ONESHELL:
-.PHONY: likwid
-likwid:
-	INSTALL_FOLDER="$${PWD}/likwid"
-	BUILD_FOLDER="$${PWD}/likwidbuild"
-	if [ -d $${INSTALL_FOLDER} ]; then rm -r $${INSTALL_FOLDER}; fi
-	mkdir --parents --verbose  $${INSTALL_FOLDER} $${BUILD_FOLDER}
-	wget -P "$${BUILD_FOLDER}" ftp://ftp.rrze.uni-erlangen.de/mirrors/likwid/likwid-$(LIKWID_VERSION).tar.gz
-	tar -C $${BUILD_FOLDER} -xf $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION).tar.gz
-	install -Dpm 0644 $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes/likwid*.h $${INSTALL_FOLDER}/
-	install -Dpm 0644 $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes/bstrlib.h $${INSTALL_FOLDER}/
-	rm -r $${BUILD_FOLDER}
+.PHONY: $(LIKWID_FOLDER)/likwid.h
+$(LIKWID_FOLDER)/likwid.h:
+	if [ "$(LIKWID_INSTALLED_FOLDER)" != "" ]; then \
+		BASE="$(LIKWID_INSTALLED_FOLDER)/../include"; \
+		mkdir -p $(LIKWID_FOLDER); \
+		cp $$BASE/*.h $(LIKWID_FOLDER); \
+	else \
+		BUILD_FOLDER="$${PWD}/likwidbuild"; \
+		if [ -d $(LIKWID_FOLDER) ]; then rm -r $(LIKWID_FOLDER); fi; \
+		mkdir --parents --verbose  $(LIKWID_FOLDER) $${BUILD_FOLDER}; \
+		wget -P "$${BUILD_FOLDER}" http://ftp.rrze.uni-erlangen.de/mirrors/likwid/likwid-$(LIKWID_VERSION).tar.gz; \
+		tar -C $${BUILD_FOLDER} -xf $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION).tar.gz; \
+		install -Dpm 0644 $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes/likwid*.h $(LIKWID_FOLDER)/; \
+		install -Dpm 0644 $${BUILD_FOLDER}/likwid-$(LIKWID_VERSION)/src/includes/bstrlib.h $(LIKWID_FOLDER)/; \
+		rm -r $${BUILD_FOLDER}; \
+	fi


 clean:
--- a/collectors/README.md
+++ b/collectors/README.md
@@ -39,6 +39,7 @@ In contrast to the configuration files for sinks and receivers, the collectors c
 * [`gpfs`](./gpfsMetric.md)
 * [`beegfs_meta`](./beegfsmetaMetric.md)
 * [`beegfs_storage`](./beegfsstorageMetric.md)
+* [`rocm_smi`](./rocmsmiMetric.md)

 ## Todos

--- a/collectors/beegfsmetaMetric.go
+++ b/collectors/beegfsmetaMetric.go
@@ -55,6 +55,7 @@ func (m *BeegfsMetaCollector) Init(config json.RawMessage) error {

 	m.name = "BeegfsMetaCollector"
 	m.setup()
+	m.parallel = true
 	// Set default beegfs-ctl binary

 	m.config.Beegfs = DEFAULT_BEEGFS_CMD
--- a/collectors/beegfsstorageMetric.go
+++ b/collectors/beegfsstorageMetric.go
@@ -48,6 +48,7 @@ func (m *BeegfsStorageCollector) Init(config json.RawMessage) error {

 	m.name = "BeegfsStorageCollector"
 	m.setup()
+	m.parallel = true
 	// Set default beegfs-ctl binary

 	m.config.Beegfs = DEFAULT_BEEGFS_CMD
--- a/collectors/collectorManager.go
+++ b/collectors/collectorManager.go
@@ -14,39 +14,43 @@ import (
 // Map of all available metric collectors
 var AvailableCollectors = map[string]MetricCollector{

-	"likwid":           new(LikwidCollector),
-	"loadavg":          new(LoadavgCollector),
-	"memstat":          new(MemstatCollector),
-	"netstat":          new(NetstatCollector),
-	"ibstat":           new(InfinibandCollector),
-	"lustrestat":       new(LustreCollector),
-	"cpustat":          new(CpustatCollector),
-	"topprocs":         new(TopProcsCollector),
-	"nvidia":           new(NvidiaCollector),
-	"customcmd":        new(CustomCmdCollector),
-	"iostat":           new(IOstatCollector),
-	"diskstat":         new(DiskstatCollector),
-	"tempstat":         new(TempCollector),
-	"ipmistat":         new(IpmiCollector),
-	"gpfs":             new(GpfsCollector),
-	"cpufreq":          new(CPUFreqCollector),
-	"cpufreq_cpuinfo":  new(CPUFreqCpuInfoCollector),
-	"nfs3stat":         new(Nfs3Collector),
-	"nfs4stat":         new(Nfs4Collector),
-	"numastats":        new(NUMAStatsCollector),
-	"beegfs_meta":      new(BeegfsMetaCollector),
-	"beegfs_storage":   new(BeegfsStorageCollector),
+	"likwid":          new(LikwidCollector),
+	"loadavg":         new(LoadavgCollector),
+	"memstat":         new(MemstatCollector),
+	"netstat":         new(NetstatCollector),
+	"ibstat":          new(InfinibandCollector),
+	"lustrestat":      new(LustreCollector),
+	"cpustat":         new(CpustatCollector),
+	"topprocs":        new(TopProcsCollector),
+	"nvidia":          new(NvidiaCollector),
+	"customcmd":       new(CustomCmdCollector),
+	"iostat":          new(IOstatCollector),
+	"diskstat":        new(DiskstatCollector),
+	"tempstat":        new(TempCollector),
+	"ipmistat":        new(IpmiCollector),
+	"gpfs":            new(GpfsCollector),
+	"cpufreq":         new(CPUFreqCollector),
+	"cpufreq_cpuinfo": new(CPUFreqCpuInfoCollector),
+	"nfs3stat":        new(Nfs3Collector),
+	"nfs4stat":        new(Nfs4Collector),
+	"numastats":       new(NUMAStatsCollector),
+	"beegfs_meta":     new(BeegfsMetaCollector),
+	"beegfs_storage":  new(BeegfsStorageCollector),
+	"rocm_smi":        new(RocmSmiCollector),
 }

 // Metric collector manager data structure
 type collectorManager struct {
-	collectors []MetricCollector          // List of metric collectors to use
-	output     chan lp.CCMetric           // Output channels
-	done       chan bool                  // channel to finish / stop metric collector manager
-	ticker     mct.MultiChanTicker        // periodically ticking once each interval
-	duration   time.Duration              // duration (for metrics that measure over a given duration)
-	wg         *sync.WaitGroup            // wait group for all goroutines in cc-metric-collector
-	config     map[string]json.RawMessage // json encoded config for collector manager
+	collectors   []MetricCollector          // List of metric collectors to read in parallel
+	serial       []MetricCollector          // List of metric collectors to read serially
+	output       chan lp.CCMetric           // Output channels
+	done         chan bool                  // channel to finish / stop metric collector manager
+	ticker       mct.MultiChanTicker        // periodically ticking once each interval
+	duration     time.Duration              // duration (for metrics that measure over a given duration)
+	wg           *sync.WaitGroup            // wait group for all goroutines in cc-metric-collector
+	config       map[string]json.RawMessage // json encoded config for collector manager
+	collector_wg sync.WaitGroup             // internally used wait group for the parallel reading of collector
+	parallel_run bool                       // Flag whether the collectors are currently read in parallel
 }

 // Metric collector manager access functions
@@ -66,6 +70,7 @@ type CollectorManager interface {
 // Initialization is done for all configured collectors
 func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Duration, wg *sync.WaitGroup, collectConfigFile string) error {
 	cm.collectors = make([]MetricCollector, 0)
+	cm.serial = make([]MetricCollector, 0)
 	cm.output = nil
 	cm.done = make(chan bool)
 	cm.wg = wg
@@ -100,7 +105,11 @@ func (cm *collectorManager) Init(ticker mct.MultiChanTicker, duration time.Durat
 			continue
 		}
 		cclog.ComponentDebug("CollectorManager", "ADD COLLECTOR", collector.Name())
-		cm.collectors = append(cm.collectors, collector)
+		if collector.Parallel() {
+			cm.collectors = append(cm.collectors, collector)
+		} else {
+			cm.serial = append(cm.serial, collector)
+		}
 	}
 	return nil
 }
@@ -116,6 +125,10 @@ func (cm *collectorManager) Start() {
 		// Collector manager is done
 		done := func() {
 			// close all metric collectors
+			if cm.parallel_run {
+				cm.collector_wg.Wait()
+				cm.parallel_run = false
+			}
 			for _, c := range cm.collectors {
 				c.Close()
 			}
@@ -130,7 +143,26 @@ func (cm *collectorManager) Start() {
 				done()
 				return
 			case t := <-tick:
+				cm.parallel_run = true
 				for _, c := range cm.collectors {
+					// Wait for done signal or execute the collector
+					select {
+					case <-cm.done:
+						done()
+						return
+					default:
+						// Read metrics from collector c via goroutine
+						cclog.ComponentDebug("CollectorManager", c.Name(), t)
+						cm.collector_wg.Add(1)
+						go func(myc MetricCollector) {
+							myc.Read(cm.duration, cm.output)
+							cm.collector_wg.Done()
+						}(c)
+					}
+				}
+				cm.collector_wg.Wait()
+				cm.parallel_run = false
+				for _, c := range cm.serial {
 					// Wait for done signal or execute the collector
 					select {
 					case <-cm.done:
--- a/collectors/cpufreqCpuinfoMetric.go
+++ b/collectors/cpufreqCpuinfoMetric.go
@@ -48,6 +48,7 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
 	m.setup()

 	m.name = "CPUFreqCpuInfoCollector"
+	m.parallel = true
 	m.meta = map[string]string{
 		"source": m.name,
 		"group":  "CPU",
@@ -150,7 +151,7 @@ func (m *CPUFreqCpuInfoCollector) Init(config json.RawMessage) error {
 		t.numNonHT = numNonHT
 		t.numNonHT_int = numNonHT_int
 		t.tagSet = map[string]string{
-			"type":       "cpu",
+			"type":       "hwthread",
 			"type-id":    t.processor,
 			"package_id": t.physicalPackageID,
 		}
--- a/collectors/cpufreqCpuinfoMetric.md
+++ b/collectors/cpufreqCpuinfoMetric.md
@@ -4,7 +4,7 @@
  "cpufreq_cpuinfo": {}
 ```

-The `cpufreq_cpuinfo` collector reads the clock frequency from `/proc/cpuinfo` and outputs a handful **cpu** metrics.
+The `cpufreq_cpuinfo` collector reads the clock frequency from `/proc/cpuinfo` and outputs a handful of **hwthread** metrics.

 Metrics:
 * `cpufreq`
--- a/collectors/cpufreqMetric.go
+++ b/collectors/cpufreqMetric.go
@@ -53,6 +53,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {

 	m.name = "CPUFreqCollector"
 	m.setup()
+	m.parallel = true
 	if len(config) > 0 {
 		err := json.Unmarshal(config, &m.config)
 		if err != nil {
@@ -161,7 +162,7 @@ func (m *CPUFreqCollector) Init(config json.RawMessage) error {
 		t.numNonHT = numNonHT
 		t.numNonHT_int = numNonHT_int
 		t.tagSet = map[string]string{
-			"type":       "cpu",
+			"type":       "hwthread",
 			"type-id":    t.processor,
 			"package_id": t.physicalPackageID,
 		}
--- a/collectors/cpufreqMetric.md
+++ b/collectors/cpufreqMetric.md
@@ -5,7 +5,7 @@
  }
 ```

-The `cpufreq` collector reads the clock frequency from `/sys/devices/system/cpu/cpu*/cpufreq` and outputs a handful **cpu** metrics.
+The `cpufreq` collector reads the clock frequency from `/sys/devices/system/cpu/cpu*/cpufreq` and outputs a handful of **hwthread** metrics.

 Metrics:
 * `cpufreq`
--- a/collectors/cpustatMetric.go
+++ b/collectors/cpustatMetric.go
@@ -30,6 +30,7 @@ type CpustatCollector struct {
 func (m *CpustatCollector) Init(config json.RawMessage) error {
 	m.name = "CpustatCollector"
 	m.setup()
+	m.parallel = true
 	m.meta = map[string]string{"source": m.name, "group": "CPU", "unit": "Percent"}
 	m.nodetags = map[string]string{"type": "node"}
 	if len(config) > 0 {
@@ -82,7 +83,7 @@ func (m *CpustatCollector) Init(config json.RawMessage) error {
 		if strings.HasPrefix(linefields[0], "cpu") && strings.Compare(linefields[0], "cpu") != 0 {
 			cpustr := strings.TrimLeft(linefields[0], "cpu")
 			cpu, _ := strconv.Atoi(cpustr)
-			m.cputags[linefields[0]] = map[string]string{"type": "cpu", "type-id": fmt.Sprintf("%d", cpu)}
+			m.cputags[linefields[0]] = map[string]string{"type": "hwthread", "type-id": fmt.Sprintf("%d", cpu)}
 			num_cpus++
 		}
 	}
--- a/collectors/customCmdMetric.go
+++ b/collectors/customCmdMetric.go
@@ -33,6 +33,7 @@ type CustomCmdCollector struct {
 func (m *CustomCmdCollector) Init(config json.RawMessage) error {
 	var err error
 	m.name = "CustomCmdCollector"
+	m.parallel = true
 	m.meta = map[string]string{"source": m.name, "group": "Custom"}
 	if len(config) > 0 {
 		err = json.Unmarshal(config, &m.config)
--- a/collectors/diskstatMetric.go
+++ b/collectors/diskstatMetric.go
@@ -29,6 +29,7 @@ type DiskstatCollector struct {

 func (m *DiskstatCollector) Init(config json.RawMessage) error {
 	m.name = "DiskstatCollector"
+	m.parallel = true
 	m.meta = map[string]string{"source": m.name, "group": "Disk"}
 	m.setup()
 	if len(config) > 0 {
@@ -77,11 +78,18 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMetric
 			continue
 		}
 		path := strings.Replace(linefields[1], `\040`, " ", -1)
-		stat := syscall.Statfs_t{}
+		stat := syscall.Statfs_t{
+			Blocks: 0,
+			Bsize:  0,
+			Bfree:  0,
+		}
 		err := syscall.Statfs(path, &stat)
 		if err != nil {
 			continue
 		}
+		if stat.Blocks == 0 || stat.Bsize == 0 {
+			continue
+		}
 		tags := map[string]string{"type": "node", "device": linefields[0]}
 		total := (stat.Blocks * uint64(stat.Bsize)) / uint64(1000000000)
 		y, err := lp.New("disk_total", tags, m.meta, map[string]interface{}{"value": total}, time.Now())
@@ -95,9 +103,11 @@ func (m *DiskstatCollector) Read(interval time.Duration, output chan lp.CCMetric
 			y.AddMeta("unit", "GBytes")
 			output <- y
 		}
-		perc := (100 * (total - free)) / total
-		if perc > part_max_used {
-			part_max_used = perc
+		if total > 0 {
+			perc := (100 * (total - free)) / total
+			if perc > part_max_used {
+				part_max_used = perc
+			}
 		}
 	}
 	y, err := lp.New("part_max_used", map[string]string{"type": "node"}, m.meta, map[string]interface{}{"value": int(part_max_used)}, time.Now())
--- a/collectors/gpfsMetric.go
+++ b/collectors/gpfsMetric.go
@@ -46,6 +46,7 @@ func (m *GpfsCollector) Init(config json.RawMessage) error {
 	var err error
 	m.name = "GpfsCollector"
 	m.setup()
+	m.parallel = true

 	// Set default mmpmon binary
 	m.config.Mmpmon = DEFAULT_GPFS_CMD
--- a/collectors/infinibandMetric.go
+++ b/collectors/infinibandMetric.go
@@ -54,6 +54,7 @@ func (m *InfinibandCollector) Init(config json.RawMessage) error {
 	var err error
 	m.name = "InfinibandCollector"
 	m.setup()
+	m.parallel = true
 	m.meta = map[string]string{
 		"source": m.name,
 		"group":  "Network",
--- a/collectors/iostatMetric.go
+++ b/collectors/iostatMetric.go
@@ -37,6 +37,7 @@ type IOstatCollector struct {
 func (m *IOstatCollector) Init(config json.RawMessage) error {
 	var err error
 	m.name = "IOstatCollector"
+	m.parallel = true
 	m.meta = map[string]string{"source": m.name, "group": "Disk"}
 	m.setup()
 	if len(config) > 0 {
--- a/collectors/ipmiMetric.go
+++ b/collectors/ipmiMetric.go
@@ -34,6 +34,7 @@ type IpmiCollector struct {
 func (m *IpmiCollector) Init(config json.RawMessage) error {
 	m.name = "IpmiCollector"
 	m.setup()
+	m.parallel = true
 	m.meta = map[string]string{"source": m.name, "group": "IPMI"}
 	m.config.IpmitoolPath = string(IPMITOOL_PATH)
 	m.config.IpmisensorsPath = string(IPMISENSORS_PATH)
--- a/collectors/likwidMetric.go
+++ b/collectors/likwidMetric.go
@@ -177,6 +177,7 @@ func getBaseFreq() float64 {

 func (m *LikwidCollector) Init(config json.RawMessage) error {
 	m.name = "LikwidCollector"
+	m.parallel = false
 	m.initialized = false
 	m.running = false
 	m.config.AccessMode = LIKWID_DEF_ACCESSMODE
@@ -204,7 +205,7 @@ func (m *LikwidCollector) Init(config json.RawMessage) error {

 	m.meta = map[string]string{"group": "PerfCounter"}
 	cclog.ComponentDebug(m.name, "Get cpulist and init maps and lists")
-	cpulist := topo.CpuList()
+	cpulist := topo.HwthreadList()
 	m.cpulist = make([]C.int, len(cpulist))
 	m.cpu2tid = make(map[int]int)
 	for i, c := range cpulist {
--- a/collectors/likwidMetric.md
+++ b/collectors/likwidMetric.md
@@ -19,7 +19,7 @@ The `likwid` collector is probably the most complicated collector. The LIKWID li
            "calc": "COUNTER0 + COUNTER1",
            "publish": false,
            "unit": "myunit",
-            "type": "cpu"
+            "type": "hwthread"
          }
        ]
      }
@@ -30,7 +30,7 @@ The `likwid` collector is probably the most complicated collector. The LIKWID li
        "calc": "sum_01",
        "publish": true,
        "unit": "myunit",
-        "type": "cpu"
+        "type": "hwthread"
      }
    ]
  }
@@ -51,15 +51,15 @@ Additional options:

 Hardware performance counters are scattered all over the system nowadays. A counter coveres a specific part of the system. While there are hardware thread specific counter for CPU cycles, instructions and so on, some others are specific for a whole CPU socket/package. To address that, the LikwidCollector provides the specification of a `type` for each metric.

- `cpu` : One metric per CPU hardware thread with the tags `"type" : "cpu"` and `"type-id" : "$cpu_id"`
+- `hwthread` : One metric per CPU hardware thread with the tags `"type" : "hwthread"` and `"type-id" : "$hwthread_id"`
 - `socket` : One metric per CPU socket/package with the tags `"type" : "socket"` and `"type-id" : "$socket_id"`

-**Note:** You should not specify the `socket` type for a metric that is measured at `cpu` scope and vice versa, so some kind of expert knowledge or lookup work in the [Likwid Wiki](https://github.com/RRZE-HPC/likwid/wiki) is required. Get the scope of each counter from the *Architecture* pages and as soon as one counter in a metric is socket-specific, the whole metric is socket-specific.
+**Note:** You cannot specify `socket` scope for a metric that is measured at `hwthread` scope, so some kind of expert knowledge or lookup work in the [Likwid Wiki](https://github.com/RRZE-HPC/likwid/wiki) is required. Get the scope of each counter from the *Architecture* pages and as soon as one counter in a metric is socket-specific, the whole metric is socket-specific.

 As a guideline:
- All counters `FIXCx`, `PMCy` and `TMAz` have the scope `cpu`
+- All counters `FIXCx`, `PMCy` and `TMAz` have the scope `hwthread`
 - All counters names containing `BOX` have the scope `socket`
- All `PWRx` counters have scope `socket`, except `"PWR1" : "RAPL_CORE_ENERGY"` has `cpu` scope (AMD Zen)
+- All `PWRx` counters have scope `socket`, except `"PWR1" : "RAPL_CORE_ENERGY"` has `hwthread` scope
 - All `DFCx` counters have scope `socket`

 ### Help with the configuration
@@ -90,7 +90,7 @@ $ scripts/likwid_perfgroup_to_cc_config.py ICX MEM_DP
      "name": "Runtime (RDTSC) [s]",
      "publish": true,
      "unit": "seconds"
-      "scope": "cpu"
+      "scope": "hwthread"
    },
    {
      "..." : "..."
@@ -147,20 +147,20 @@ One might think this does not happen often but often used metrics in the world o
          {
            "name": "ipc",
            "calc": "PMC0/PMC1",
-            "type": "cpu",
+            "type": "hwthread",
            "publish": true
          },
          {
            "name": "flops_any",
            "calc": "0.000001*PMC2/time",
            "unit": "MFlops/s",
-            "type": "cpu",
+            "type": "hwthread",
            "publish": true
          },
          {
            "name": "clock",
            "calc": "0.000001*(FIXC1/FIXC2)/inverseClock",
-            "type": "cpu",
+            "type": "hwthread",
            "unit": "MHz",
            "publish": true
          },
@@ -219,3 +219,33 @@ One might think this does not happen often but often used metrics in the world o
  }
 ```

+### How to get the eventsets and metrics from LIKWID
+
+The `likwid` collector reads hardware performance counters at **hwthread** and **socket** level. The configuration looks quite complicated, but it is basically copy & paste from [LIKWID's performance groups](https://github.com/RRZE-HPC/likwid/tree/master/groups). Earlier iterations of the collector tried to use the performance groups, but that lacked flexibility. The current way of configuration provides the most flexibility.
+
+The logic is as follows: There are multiple eventsets, each consisting of a list of counters+events and a list of metrics. If you compare a common performance group with the example setting above, there is not much difference:
+```
+EVENTSET                         ->   "events": {
+FIXC1 ACTUAL_CPU_CLOCK           ->     "FIXC1": "ACTUAL_CPU_CLOCK",
+FIXC2 MAX_CPU_CLOCK              ->     "FIXC2": "MAX_CPU_CLOCK",
+PMC0  RETIRED_INSTRUCTIONS       ->     "PMC0" : "RETIRED_INSTRUCTIONS",
+PMC1  CPU_CLOCKS_UNHALTED        ->     "PMC1" : "CPU_CLOCKS_UNHALTED",
+PMC2  RETIRED_SSE_AVX_FLOPS_ALL  ->     "PMC2": "RETIRED_SSE_AVX_FLOPS_ALL",
+PMC3  MERGE                      ->     "PMC3": "MERGE",
+                                 ->   }
+```
+
+The metrics follow the same procedure:
+
+```
+METRICS                          ->   "metrics": [
+IPC   PMC0/PMC1                  ->     {
+                                 ->       "name" : "IPC",
+                                 ->       "calc" : "PMC0/PMC1",
+                                 ->       "scope": "hwthread",
+                                 ->       "publish": true
+                                 ->     }
+                                 ->   ]
+```
+
+The script `scripts/likwid_perfgroup_to_cc_config.py` might help you.
--- a/collectors/loadavgMetric.go
+++ b/collectors/loadavgMetric.go
@@ -36,6 +36,7 @@ type LoadavgCollector struct {

 func (m *LoadavgCollector) Init(config json.RawMessage) error {
 	m.name = "LoadavgCollector"
+	m.parallel = true
 	m.setup()
 	if len(config) > 0 {
 		err := json.Unmarshal(config, &m.config)
--- a/collectors/lustreMetric.go
+++ b/collectors/lustreMetric.go
@@ -288,6 +288,7 @@ var LustreDeriveMetrics = []LustreMetricDefinition{
 func (m *LustreCollector) Init(config json.RawMessage) error {
 	var err error
 	m.name = "LustreCollector"
+	m.parallel = true
 	if len(config) > 0 {
 		err = json.Unmarshal(config, &m.config)
 		if err != nil {
--- a/collectors/memstatMetric.go
+++ b/collectors/memstatMetric.go
@@ -81,6 +81,7 @@ func getStats(filename string) map[string]MemstatStats {
 func (m *MemstatCollector) Init(config json.RawMessage) error {
 	var err error
 	m.name = "MemstatCollector"
+	m.parallel = true
 	m.config.NodeStats = true
 	m.config.NumaStats = false
 	if len(config) > 0 {
@@ -159,6 +160,7 @@ func (m *MemstatCollector) Init(config json.RawMessage) error {

 func (m *MemstatCollector) Read(interval time.Duration, output chan lp.CCMetric) {
 	if !m.init {
+		cclog.ComponentPrint(m.name, "Here")
 		return
 	}

--- a/collectors/metricCollector.go
+++ b/collectors/metricCollector.go
@@ -3,27 +3,25 @@ package collectors
 import (
 	"encoding/json"
 	"fmt"
-	"io/ioutil"
-	"log"
-	"strconv"
-	"strings"
 	"time"

 	lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
 )

 type MetricCollector interface {
-	Name() string                                         // Name of the metric collector
-	Init(config json.RawMessage) error                    // Initialize metric collector
-	Initialized() bool                                    // Is metric collector initialized?
+	Name() string                      // Name of the metric collector
+	Init(config json.RawMessage) error // Initialize metric collector
+	Initialized() bool                 // Is metric collector initialized?
+	Parallel() bool
 	Read(duration time.Duration, output chan lp.CCMetric) // Read metrics from metric collector
 	Close()                                               // Close / finish metric collector
 }

 type metricCollector struct {
-	name string            // name of the metric
-	init bool              // is metric collector initialized?
-	meta map[string]string // static meta data tags
+	name     string            // name of the metric
+	init     bool              // is metric collector initialized?
+	parallel bool              // can the metric collector be executed in parallel with others
+	meta     map[string]string // static meta data tags
 }

 // Name returns the name of the metric collector
@@ -31,6 +29,11 @@ func (c *metricCollector) Name() string {
 	return c.name
 }

+// Parallel returns whether the metric collector can be executed in parallel with others
+func (c *metricCollector) Parallel() bool {
+	return c.parallel
+}
+
 // Setup is for future use
 func (c *metricCollector) setup() error {
 	return nil
@@ -65,58 +68,6 @@ func stringArrayContains(array []string, str string) (int, bool) {
 	return -1, false
 }

-// SocketList returns the list of physical sockets as read from /proc/cpuinfo
-func SocketList() []int {
-	buffer, err := ioutil.ReadFile("/proc/cpuinfo")
-	if err != nil {
-		log.Print(err)
-		return nil
-	}
-	ll := strings.Split(string(buffer), "\n")
-	var packs []int
-	for _, line := range ll {
-		if strings.HasPrefix(line, "physical id") {
-			lv := strings.Fields(line)
-			id, err := strconv.ParseInt(lv[3], 10, 32)
-			if err != nil {
-				log.Print(err)
-				return packs
-			}
-			_, found := intArrayContains(packs, int(id))
-			if !found {
-				packs = append(packs, int(id))
-			}
-		}
-	}
-	return packs
-}
-
-// CpuList returns the list of physical CPUs (in contrast to logical CPUs) as read from /proc/cpuinfo
-func CpuList() []int {
-	buffer, err := ioutil.ReadFile("/proc/cpuinfo")
-	if err != nil {
-		log.Print(err)
-		return nil
-	}
-	ll := strings.Split(string(buffer), "\n")
-	var cpulist []int
-	for _, line := range ll {
-		if strings.HasPrefix(line, "processor") {
-			lv := strings.Fields(line)
-			id, err := strconv.ParseInt(lv[2], 10, 32)
-			if err != nil {
-				log.Print(err)
-				return cpulist
-			}
-			_, found := intArrayContains(cpulist, int(id))
-			if !found {
-				cpulist = append(cpulist, int(id))
-			}
-		}
-	}
-	return cpulist
-}
-
 // RemoveFromStringList removes the string r from the array of strings s
 // If r is not contained in the array an error is returned
 func RemoveFromStringList(s []string, r string) ([]string, error) {
--- a/collectors/netstatMetric.go
+++ b/collectors/netstatMetric.go
@@ -39,6 +39,7 @@ type NetstatCollector struct {

 func (m *NetstatCollector) Init(config json.RawMessage) error {
 	m.name = "NetstatCollector"
+	m.parallel = true
 	m.setup()
 	m.lastTimestamp = time.Now()

--- a/collectors/nfsMetric.go
+++ b/collectors/nfsMetric.go
@@ -114,6 +114,7 @@ func (m *nfsCollector) MainInit(config json.RawMessage) error {
 	m.data = make(map[string]NfsCollectorData)
 	m.initStats()
 	m.init = true
+	m.parallel = true
 	return nil
 }

--- a/collectors/numastatsMetric.go
+++ b/collectors/numastatsMetric.go
@@ -54,6 +54,7 @@ func (m *NUMAStatsCollector) Init(config json.RawMessage) error {
 	}

 	m.name = "NUMAStatsCollector"
+	m.parallel = true
 	m.setup()
 	m.meta = map[string]string{
 		"source": m.name,
--- a/collectors/nvidiaMetric.go
+++ b/collectors/nvidiaMetric.go
--- a/collectors/nvidiaMetric.md
+++ b/collectors/nvidiaMetric.md
@@ -3,38 +3,74 @@

 ```json
  "nvidia": {
-    "exclude_devices" : [
-      "0","1"
+    "exclude_devices": [
+      "0","1", "0000000:ff:01.0"
    ],
    "exclude_metrics": [
-      "nv_fb_memory",
+      "nv_fb_mem_used",
      "nv_fan"
-    ]
+    ],
+    "process_mig_devices": false,
+    "use_pci_info_as_type_id": true,
+    "add_pci_info_tag": false,
+    "add_uuid_meta": false,
+    "add_board_number_meta": false,
+    "add_serial_meta": false,
+    "use_uuid_for_mig_device": false,
+    "use_slice_for_mig_device": false
  }
 ```

+The `nvidia` collector can be configured to leave out specific devices with the `exclude_devices` option. It takes IDs as supplied to the NVML with `nvmlDeviceGetHandleByIndex()` or the PCI address in NVML format (`%08X:%02X:%02X.0`). Metrics (listed below) that should not be sent to the MetricRouter can be excluded with the `exclude_metrics` option. Commonly only the physical GPUs are monitored. If MIG devices should be analyzed as well, set `process_mig_devices` (adds `stype=mig,stype-id=<mig_index>`). With the options `use_uuid_for_mig_device` and `use_slice_for_mig_device`, the `<mig_index>` can be replaced with the UUID (e.g. `MIG-6a9f7cc8-6d5b-5ce0-92de-750edc4d8849`) or the MIG slice name (e.g. `1g.5gb`).
+
+The metrics sent by the `nvidia` collector use `accelerator` as `type` tag. For the `type-id`, it uses the device handle index by default. With the `use_pci_info_as_type_id` option, the PCI ID is used instead. If both values should be added as tags, activate the `add_pci_info_tag` option. It uses the device handle index as `type-id` and adds the PCI ID as separate `pci_identifier` tag.
+
+Optionally, it is possible to add the UUID, the board part number and the serial to the meta information. They are not sent to the sinks (if not configured otherwise).
+
+
 Metrics:
 * `nv_util`
 * `nv_mem_util`
-* `nv_mem_total`
-* `nv_fb_memory`
+* `nv_fb_mem_total`
+* `nv_fb_mem_used`
+* `nv_bar1_mem_total`
+* `nv_bar1_mem_used`
 * `nv_temp`
 * `nv_fan`
 * `nv_ecc_mode`
 * `nv_perf_state`
-* `nv_power_usage_report`
-* `nv_graphics_clock_report`
-* `nv_sm_clock_report`
-* `nv_mem_clock_report`
+* `nv_power_usage`
+* `nv_graphics_clock`
+* `nv_sm_clock`
+* `nv_mem_clock`
+* `nv_video_clock`
 * `nv_max_graphics_clock`
 * `nv_max_sm_clock`
 * `nv_max_mem_clock`
-* `nv_ecc_db_error`
-* `nv_ecc_sb_error`
-* `nv_power_man_limit`
+* `nv_max_video_clock`
+* `nv_ecc_uncorrected_error`
+* `nv_ecc_corrected_error`
+* `nv_power_max_limit`
 * `nv_encoder_util`
 * `nv_decoder_util`
+* `nv_remapped_rows_corrected`
+* `nv_remapped_rows_uncorrected`
+* `nv_remapped_rows_pending`
+* `nv_remapped_rows_failure`
+* `nv_compute_processes`
+* `nv_graphics_processes`
+* `nv_violation_power`
+* `nv_violation_thermal`
+* `nv_violation_sync_boost`
+* `nv_violation_board_limit`
+* `nv_violation_low_util`
+* `nv_violation_reliability`
+* `nv_violation_below_app_clock`
+* `nv_violation_below_base_clock`
+* `nv_nvlink_crc_flit_errors`
+* `nv_nvlink_crc_errors`
+* `nv_nvlink_ecc_errors`
+* `nv_nvlink_replay_errors`
+* `nv_nvlink_recovery_errors`

-It uses a separate `type` in the metrics. The output metric looks like this:
-`<name>,type=accelerator,type-id=<nvidia-gpu-id> value=<metric value> <timestamp>`
-
+Some metrics add an additional sub type tag (`stype`); for example, the `nv_nvlink_*` metrics set `stype=nvlink,stype-id=<link_number>`.
--- a/collectors/rocmsmiMetric.go
+++ b/collectors/rocmsmiMetric.go
@@ -0,0 +1,319 @@
+package collectors
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"time"
+
+	cclog "github.com/ClusterCockpit/cc-metric-collector/internal/ccLogger"
+	lp "github.com/ClusterCockpit/cc-metric-collector/internal/ccMetric"
+	"github.com/ClusterCockpit/go-rocm-smi/pkg/rocm_smi"
+)
+
+type RocmSmiCollectorConfig struct { // JSON configuration of the collector, filled by json.Unmarshal in Init
+	ExcludeMetrics     []string `json:"exclude_metrics,omitempty"`         // metric names that Read must not send
+	ExcludeDevices     []string `json:"exclude_devices,omitempty"`         // devices to skip, given as logical index or PCI ID string
+	AddPciInfoTag      bool     `json:"add_pci_info_tag,omitempty"`        // add the PCI ID as 'pci_identifier' tag (only if UsePciInfoAsTypeId is false)
+	UsePciInfoAsTypeId bool     `json:"use_pci_info_as_type_id,omitempty"` // use the PCI ID instead of the logical index as 'type-id' tag
+	AddSerialMeta      bool     `json:"add_serial_meta,omitempty"`         // add the device serial number as 'serial' meta information
+}
+
+type RocmSmiCollectorDevice struct { // per-device state that is built once in Init and reused by every Read
+	device         rocm_smi.DeviceHandle
+	index          int
+	tags           map[string]string // default tags ('type', 'type-id' and optionally 'pci_identifier')
+	meta           map[string]string // default meta information ('source', 'group' and optionally 'serial')
+	excludeMetrics map[string]bool   // copy of exclude metrics from config, used as a set
+}
+
+type RocmSmiCollector struct { // metric collector that reads GPU metrics through the ROCm SMI library
+	metricCollector
+	config  RocmSmiCollectorConfig // the configuration structure parsed in Init
+	devices []RocmSmiCollectorDevice
+}
+
+// Functions to implement MetricCollector interface
+// Init(...), Read(...), Close()
+// See: metricCollector.go
+
+// Init parses the JSON configuration, initializes the ROCm SMI library and builds the list of monitored devices
+// Called once by the collector manager
+// Returns an error if the configuration cannot be parsed, the library cannot be initialized or a device query fails
+func (m *RocmSmiCollector) Init(config json.RawMessage) error {
+	var err error = nil
+	// Always set the name early in Init() to use it in cclog.Component* functions
+	m.name = "RocmSmiCollector"
+	// This is for later use, also call it early
+	m.setup()
+	// Meta information and tags are not set on the collector itself (see the
+	// commented-out assignments below); they are defined per device further down
+	//m.meta = map[string]string{"source": m.name, "group": "AMD"}
+	// Define tags sent with each metric
+	// The 'type' tag is always needed, it defines the granularity of the metric
+	// node -> whole system
+	// hwthread -> single CPU hardware thread (requires hardware thread ID as 'type-id' tag)
+	// accelerator -> accelerator device like a GPU (requires an accelerator ID as 'type-id' tag)
+	//m.tags = map[string]string{"type": "node"}
+	// Read in the JSON configuration
+	if len(config) > 0 { // with an empty config, m.config is left untouched
+		err = json.Unmarshal(config, &m.config)
+		if err != nil {
+			cclog.ComponentError(m.name, "Error reading config:", err.Error())
+			return err
+		}
+	}
+
+	ret := rocm_smi.Init() // initialize the ROCm SMI library before any device is queried
+	if ret != rocm_smi.STATUS_SUCCESS {
+		err = errors.New("Failed to initialize ROCm SMI library")
+		cclog.ComponentError(m.name, err.Error())
+		return err
+	}
+
+	numDevs, ret := rocm_smi.NumMonitorDevices()
+	if ret != rocm_smi.STATUS_SUCCESS {
+		err = errors.New("Failed to get number of GPUs from ROCm SMI library")
+		cclog.ComponentError(m.name, err.Error())
+		return err // NOTE(review): rocm_smi.Shutdown() is not called on this or the later error paths
+	}
+
+	exclDev := func(s string) bool { // reports whether s is listed in the exclude_devices option
+		skip_device := false
+		for _, excl := range m.config.ExcludeDevices {
+			if excl == s { // exact, case-sensitive string comparison
+				skip_device = true
+				break
+			}
+		}
+		return skip_device
+	}
+
+	m.devices = make([]RocmSmiCollectorDevice, 0)
+
+	for i := 0; i < numDevs; i++ {
+		str_i := fmt.Sprintf("%d", i)
+		if exclDev(str_i) { // device excluded by its logical index
+			continue
+		}
+		device, ret := rocm_smi.DeviceGetHandleByIndex(i)
+		if ret != rocm_smi.STATUS_SUCCESS {
+			err = fmt.Errorf("Failed to get handle for GPU %d", i)
+			cclog.ComponentError(m.name, err.Error())
+			return err
+		}
+
+		pciInfo, ret := rocm_smi.DeviceGetPciInfo(device)
+		if ret != rocm_smi.STATUS_SUCCESS {
+			err = fmt.Errorf("Failed to get PCI information for GPU %d", i)
+			cclog.ComponentError(m.name, err.Error())
+			return err
+		}
+
+		pciId := fmt.Sprintf( // PCI ID as <domain>:<bus>:<device>.<function> in upper-case hexadecimal
+			"%08X:%02X:%02X.%X",
+			pciInfo.Domain,
+			pciInfo.Bus,
+			pciInfo.Device,
+			pciInfo.Function)
+
+		if exclDev(pciId) { // device excluded by its PCI ID; the config entry must match the format above exactly
+			continue
+		}
+
+		dev := RocmSmiCollectorDevice{
+			device: device,
+			tags: map[string]string{
+				"type":    "accelerator",
+				"type-id": str_i, // logical index by default, may be replaced by the PCI ID below
+			},
+			meta: map[string]string{
+				"source": m.name,
+				"group":  "AMD",
+			},
+		}
+		if m.config.UsePciInfoAsTypeId { // takes precedence: add_pci_info_tag is ignored in this case
+			dev.tags["type-id"] = pciId
+		} else if m.config.AddPciInfoTag {
+			dev.tags["pci_identifier"] = pciId
+		}
+
+		if m.config.AddSerialMeta {
+			serial, ret := rocm_smi.DeviceGetSerialNumber(device)
+			if ret != rocm_smi.STATUS_SUCCESS { // not fatal: the device is kept, only without the 'serial' meta entry
+				cclog.ComponentError(m.name, "Unable to get serial number for device at index", i, ":", rocm_smi.StatusStringNoError(ret))
+			} else {
+				dev.meta["serial"] = serial
+			}
+		}
+		// Turn the exclude_metrics list into a per-device set that Read can query by metric name
+		dev.excludeMetrics = map[string]bool{}
+		for _, e := range m.config.ExcludeMetrics {
+			dev.excludeMetrics[e] = true
+		}
+		dev.index = i // logical index, used in the log messages of Read
+		m.devices = append(m.devices, dev)
+	}
+
+	// Set this flag only if everything is initialized properly, all required files exist, ...
+	m.init = true
+	return err // err is nil here, every failure path above has already returned
+}
+
+// Read queries the ROCm SMI metrics of every device registered in Init and sends each metric that is not excluded
+// through the output channel to the collector manager; the interval argument is not used by this collector
+func (m *RocmSmiCollector) Read(interval time.Duration, output chan lp.CCMetric) {
+	// All metrics created in this call share a single timestamp
+	timestamp := time.Now()
+
+	for _, dev := range m.devices {
+		metrics, ret := rocm_smi.DeviceGetMetrics(dev.device) // a single query provides all values used below
+		if ret != rocm_smi.STATUS_SUCCESS {
+			cclog.ComponentError(m.name, "Unable to get metrics for device at index", dev.index, ":", rocm_smi.StatusStringNoError(ret))
+			continue // skip only this device, the remaining devices are still read
+		}
+
+		if !dev.excludeMetrics["rocm_gfx_util"] { // a name missing in the map yields false, so the metric is sent
+			value := metrics.Average_gfx_activity
+			y, err := lp.New("rocm_gfx_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
+			if err == nil { // a metric that cannot be created is dropped without a log message
+				output <- y
+			}
+		}
+		if !dev.excludeMetrics["rocm_umc_util"] {
+			value := metrics.Average_umc_activity
+			y, err := lp.New("rocm_umc_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
+			if err == nil {
+				output <- y
+			}
+		}
+		if !dev.excludeMetrics["rocm_mm_util"] {
+			value := metrics.Average_mm_activity
+			y, err := lp.New("rocm_mm_util", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
+			if err == nil {
+				output <- y
+			}
+		}
+		if !dev.excludeMetrics["rocm_avg_power"] {
+			value := metrics.Average_socket_power
+			y, err := lp.New("rocm_avg_power", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
+			if err == nil {
+				output <- y
+			}
+		}
+		if !dev.excludeMetrics["rocm_temp_mem"] {
+			value := metrics.Temperature_mem
+			y, err := lp.New("rocm_temp_mem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
+			if err == nil {
+				output <- y
+			}
+		}
+		if !dev.excludeMetrics["rocm_temp_hotspot"] {
+			value := metrics.Temperature_hotspot
+			y, err := lp.New("rocm_temp_hotspot", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
+			if err == nil {
+				output <- y
+			}
+		}
+		if !dev.excludeMetrics["rocm_temp_edge"] {
+			value := metrics.Temperature_edge
+			y, err := lp.New("rocm_temp_edge", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
+			if err == nil {
+				output <- y
+			}
+		}
+		if !dev.excludeMetrics["rocm_temp_vrgfx"] {
+			value := metrics.Temperature_vrgfx
+			y, err := lp.New("rocm_temp_vrgfx", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
+			if err == nil {
+				output <- y
+			}
+		}
+		if !dev.excludeMetrics["rocm_temp_vrsoc"] {
+			value := metrics.Temperature_vrsoc
+			y, err := lp.New("rocm_temp_vrsoc", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
+			if err == nil {
+				output <- y
+			}
+		}
+		if !dev.excludeMetrics["rocm_temp_vrmem"] {
+			value := metrics.Temperature_vrmem
+			y, err := lp.New("rocm_temp_vrmem", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
+			if err == nil {
+				output <- y
+			}
+		}
+		if !dev.excludeMetrics["rocm_gfx_clock"] {
+			value := metrics.Average_gfxclk_frequency
+			y, err := lp.New("rocm_gfx_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
+			if err == nil {
+				output <- y
+			}
+		}
+		if !dev.excludeMetrics["rocm_soc_clock"] {
+			value := metrics.Average_socclk_frequency
+			y, err := lp.New("rocm_soc_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
+			if err == nil {
+				output <- y
+			}
+		}
+		if !dev.excludeMetrics["rocm_u_clock"] {
+			value := metrics.Average_uclk_frequency
+			y, err := lp.New("rocm_u_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
+			if err == nil {
+				output <- y
+			}
+		}
+		if !dev.excludeMetrics["rocm_v0_clock"] {
+			value := metrics.Average_vclk0_frequency
+			y, err := lp.New("rocm_v0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
+			if err == nil {
+				output <- y
+			}
+		}
+		if !dev.excludeMetrics["rocm_v1_clock"] {
+			value := metrics.Average_vclk1_frequency
+			y, err := lp.New("rocm_v1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
+			if err == nil {
+				output <- y
+			}
+		}
+		if !dev.excludeMetrics["rocm_d0_clock"] {
+			value := metrics.Average_dclk0_frequency
+			y, err := lp.New("rocm_d0_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
+			if err == nil {
+				output <- y
+			}
+		}
+		if !dev.excludeMetrics["rocm_d1_clock"] {
+			value := metrics.Average_dclk1_frequency
+			y, err := lp.New("rocm_d1_clock", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
+			if err == nil {
+				output <- y
+			}
+		}
+		if !dev.excludeMetrics["rocm_temp_hbm"] {
+			for i := 0; i < rocm_smi.NUM_HBM_INSTANCES; i++ { // one metric per HBM instance
+				value := metrics.Temperature_hbm[i]
+				y, err := lp.New("rocm_temp_hbm", dev.tags, dev.meta, map[string]interface{}{"value": value}, timestamp)
+				if err == nil {
+					y.AddTag("stype", "device") // sub type tags distinguish the HBM instances of one device
+					y.AddTag("stype-id", fmt.Sprintf("%d", i))
+					output <- y
+				}
+			}
+		}
+	}
+
+}
+
+// Close shuts down the ROCm SMI library and marks the collector as not initialized
+// Called once by the collector manager
+func (m *RocmSmiCollector) Close() {
+	// Shut down the ROCm SMI library; a failure is only logged
+	ret := rocm_smi.Shutdown()
+	if ret != rocm_smi.STATUS_SUCCESS {
+		cclog.ComponentError(m.name, "Failed to shutdown ROCm SMI library")
+	}
+	m.init = false // unset flag, regardless of the shutdown result
+}
--- a/collectors/rocmsmiMetric.md
+++ b/collectors/rocmsmiMetric.md
@@ -0,0 +1,47 @@
+
+## `rocm_smi` collector
+
+```json
+  "rocm_smi": {
+    "exclude_devices": [
+      "0", "1", "00000000:FF:01.0"
+    ],
+    "exclude_metrics": [
+      "rocm_mm_util",
+      "rocm_temp_vrsoc"
+    ],
+    "use_pci_info_as_type_id": true,
+    "add_pci_info_tag": false,
+    "add_serial_meta": false
+  }
+```
+
+The `rocm_smi` collector can be configured to leave out specific devices with the `exclude_devices` option. It takes logical IDs in the list of available devices or the PCI address formatted as `%08X:%02X:%02X.%X` (domain, bus, device and function in upper-case hexadecimal, similar to the NVML format). Metrics (listed below) that should not be sent to the MetricRouter can be excluded with the `exclude_metrics` option.
+
+The metrics sent by the `rocm_smi` collector use `accelerator` as `type` tag. For the `type-id`, it uses the device handle index by default. With the `use_pci_info_as_type_id` option, the PCI ID is used instead. If both values should be added as tags, activate the `add_pci_info_tag` option. It uses the device handle index as `type-id` and adds the PCI ID as separate `pci_identifier` tag.
+
+Optionally, it is possible to add the serial number to the meta information with the `add_serial_meta` option. Meta information is not sent to the sinks (if not configured otherwise).
+
+
+Metrics:
+* `rocm_gfx_util`
+* `rocm_umc_util`
+* `rocm_mm_util`
+* `rocm_avg_power`
+* `rocm_temp_mem`
+* `rocm_temp_hotspot`
+* `rocm_temp_edge`
+* `rocm_temp_vrgfx`
+* `rocm_temp_vrsoc`
+* `rocm_temp_vrmem`
+* `rocm_gfx_clock`
+* `rocm_soc_clock`
+* `rocm_u_clock`
+* `rocm_v0_clock`
+* `rocm_v1_clock`
+* `rocm_d0_clock`
+* `rocm_d1_clock`
+* `rocm_temp_hbm`
+
+
+Some metrics add an additional sub-type tag (`stype`). For example, the `rocm_temp_hbm` metrics set `stype=device,stype-id=<HBM_slice_number>`.
--- a/collectors/sampleMetric.go
+++ b/collectors/sampleMetric.go
@@ -35,6 +35,10 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
 	m.name = "InternalCollector"
 	// This is for later use, also call it early
 	m.setup()
+	// Tell whether the collector should be run in parallel with others (reading files, ...)
+	// or it should be run serially, mostly for collectors actually doing measurements
+	// because they should not measure the execution of the other collectors
+	m.parallel = true
 	// Define meta information sent with each metric
 	// (Can also be dynamic or this is the basic set with extension through AddMeta())
 	m.meta = map[string]string{"source": m.name, "group": "SAMPLE"}
@@ -42,7 +46,12 @@ func (m *SampleCollector) Init(config json.RawMessage) error {
 	// The 'type' tag is always needed, it defines the granulatity of the metric
 	// node -> whole system
 	// socket -> CPU socket (requires socket ID as 'type-id' tag)
-	// cpu -> single CPU hardware thread (requires cpu ID as 'type-id' tag)
+	// die -> CPU die (requires CPU die ID as 'type-id' tag)
+	// memoryDomain -> NUMA domain (requires NUMA domain ID as 'type-id' tag)
+	// llc -> Last level cache (requires last level cache ID as 'type-id' tag)
+	// core -> single CPU core that may consist of multiple hardware threads (SMT) (requires core ID as 'type-id' tag)
+	// hwthread -> single CPU hardware thread (requires hardware thread ID as 'type-id' tag)
+	// accelerator -> An accelerator device like GPU or FPGA (requires an accelerator ID as 'type-id' tag)
 	m.tags = map[string]string{"type": "node"}
 	// Read in the JSON configuration
 	if len(config) > 0 {
--- a/collectors/tempMetric.go
+++ b/collectors/tempMetric.go
@@ -50,6 +50,7 @@ func (m *TempCollector) Init(config json.RawMessage) error {
 	}

 	m.name = "TempCollector"
+	m.parallel = true
 	m.setup()
 	if len(config) > 0 {
 		err := json.Unmarshal(config, &m.config)
@@ -116,6 +117,10 @@ func (m *TempCollector) Init(config json.RawMessage) error {
 		}

 		// Sensor file
+		_, err = ioutil.ReadFile(file)
+		if err != nil {
+			continue
+		}
 		sensor.file = file

 		// Sensor tags
--- a/collectors/topprocsMetric.go
+++ b/collectors/topprocsMetric.go
@@ -28,6 +28,7 @@ type TopProcsCollector struct {
 func (m *TopProcsCollector) Init(config json.RawMessage) error {
 	var err error
 	m.name = "TopProcsCollector"
+	m.parallel = true
 	m.tags = map[string]string{"type": "node"}
 	m.meta = map[string]string{"source": m.name, "group": "TopProcs"}
 	if len(config) > 0 {